11.6 C
Canberra
Thursday, June 11, 2026

Multimodal Browser AI with Transformers.js for Pictures and Speech


<title>Multimodal Media Analyzer</title>

  

    * { fieldsizing: borderfield; margin: 0; padding: 0; }

    physique {

      fonthousehold: systemui, sansserif;

      maxwidth: 820px;

      margin: 0 auto;

      padding: 1.5rem 1rem;

      background: #f1f5f9;

      colour: #1e293b;

    }

 

    header { marginbackside: 1.5rem; }

    header h1 { fontdimension: 1.5rem; }

    header p  { colour: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

 

    /* Mannequin standing indicators */

    .mannequinstandingbar {

      show: flex;

      hole: 0.5rem;

      flexwrap: wrap;

      marginhigh: 0.75rem;

    }

    .mannequinbadge {

      fontdimension: 0.78rem;

      padding: 0.2rem 0.6rem;

      borderradius: 12px;

      background: #fef3c7;

      colour: #92400e;

    }

    .mannequinbadge.prepared { background: #dcfce7; colour: #15803d; }

 

    /* Tab bar */

    .tabs {

      show: flex;

      background: white;

      borderradius: 8px;

      padding: 0.25rem;

      hole: 0.25rem;

      marginbackside: 1.25rem;

      border: 1px stable #e2e8f0;

    }

    .tab {

      flex: 1;

      padding: 0.5rem;

      textual contentalign: heart;

      borderradius: 6px;

      cursor: pointer;

      fontdimension: 0.9rem;

      colour: #64748b;

      transition: all 0.15s;

    }

    .tab.lively { background: #2563eb; colour: white; font-weight: 600; }

 

    /* Enter panels */

    .panel { show: none; }

    .panel.lively { show: block; }

 

    .addspace {

      background: white;

      border: 2px dashed #cbd5e1;

      borderradius: 8px;

      padding: 2rem;

      textual contentalign: heart;

      cursor: pointer;

    }

    .addspace enter { show: none; }

 

    #img-preview {

      marginhigh: 1rem;

      maxwidth: 100%;

      maxpeak: 320px;

      borderradius: 8px;

      show: none;

      objectmatch: cowl;

    }

 

    .micheart { textual contentalign: heart; padding: 1rem 0; }

    #rec-btn {

      width: 72px; peak: 72px;

      borderradius: 50%; border: none;

      background: #dc2626; colour: white;

      fontdimension: 1.6rem; cursor: pointer;

      show: flex; alignobjects: heart; justifycontent material: heart;

      margin: 0 auto 0.5rem;

    }

    #rec-btn.recording { background: #374151; }

    #rec-btn:disabled  { background: #94a3b8; cursor: not-allowed; }

    #rec-timer { font-weight: 600; colour: #374151; margin-bottom: 0.25rem; }

    #rec-hint  { font-size: 0.85rem; colour: #64748b; }

    #wave-canvas { show: block; margin: 0.5rem auto; border-radius: 4px; }

 

    /* Outcomes grid */

    .outcomesgrid {

      show: grid;

      gridtemplatecolumns: repeat(automatch, minmax(220px, 1fr));

      hole: 1rem;

      marginhigh: 1.25rem;

    }

    .end resultcard {

      background: white;

      border: 1px stable #e2e8f0;

      borderradius: 8px;

      padding: 1rem;

    }

    .end resultcard h3 {

      fontdimension: 0.75rem;

      textual contentrework: uppercase;

      letterspacing: 0.06em;

      colour: #64748b;

      marginbackside: 0.6rem;

    }

    .labelmerchandise {

      show: flex;

      justifycontent material: housebetween;

      alignobjects: heart;

      padding: 0.25rem 0;

      fontdimension: 0.875rem;

      borderbackside: 1px stable #f1f5f9;

    }

    .labelrating {

      fontdimension: 0.8rem;

      colour: #64748b;

      background: #f1f5f9;

      padding: 0.1rem 0.4rem;

      borderradius: 4px;

    }

    .captionphysique {

      fontdimension: 0.95rem;

      linepeak: 1.5;

      fonttype: italic;

      colour: #334155;

    }

    .transcriptphysique {

      fontdimension: 0.95rem;

      linepeak: 1.6;

      colour: #334155;

      whitehouse: prewrap;

    }

    .placeholdertextual content { colour: #94a3b8; font-style: italic; font-size: 0.9rem; }

    #global-status {

      fontdimension: 0.85rem;

      colour: #64748b;

      marginbackside: 1rem;

    }

 

    @media (maxwidth: 500px) {

      .outcomesgrid { gridtemplatecolumns: 1fr; }

    }

  

  <header>

    <h1>Multimodal Media Analyzer</h1>

    <p>Picture classification, captioning, and speech transcription all in your browser.</p>

    <div class=“model-status-bar”>

      <span class=“model-badge” id=“badge-cls”>Classifier: loading...</span>

      <span class=“model-badge” id=“badge-cap”>Captioner: loading...</span>

      <span class=“model-badge” id=“badge-asr”>Whisper: loading...</span>

    </div>

  </header>

 

  <div id=“global-status”>Loading fashions in parallel first run downloads ~400 MB whole.</div>

 

  <div class=“tabs”>

    <div class=“tab lively” informationtab=“picture”>🖼 Picture Evaluation</div>

    <div class=“tab” informationtab=“speech”>🎙 Speech Transcription</div>

  </div>

 

  <! Picture panel >

  <div class=“panel lively” id=“panel-image”>

    <div class=“upload-area” id=“img-drop”>

      

      <p>Click on or drag an picture to analyze</p>

      <p type=“font-size:0.8rem;colour:#94a3b8;margin-top:0.3rem”>

        JPG, PNG, WebP, GIF supported

      </p>

    </div>

    <img id=“img-preview” alt=“Preview” />

  </div>

 

  <! Speech panel >

  <div class=“panel” id=“panel-speech”>

    <div class=“mic-center”>

      <button id=“rec-btn” disabled>🎙</button>

      <div id=“rec-timer”>0:00</div>

      <div id=“rec-hint”>Ready for Whisper mannequin...</div>

    </div>

    

  </div>

 

  <! Outcomes proven for each modes >

  <div class=“results-grid” id=“results-grid” type=“show:none”>

    <! Picture outcomes (proven in picture mode) >

    <div class=“result-card” id=“card-cls” type=“show:none”>

      <h3>Classification</h3>

      <div id=“cls-content”>

        <p class=“placeholder-text”>No outcomes but.</p>

      </div>

    </div>

    <div class=“result-card” id=“card-cap” type=“show:none”>

      <h3>Caption</h3>

      <div id=“cap-content”>

        <p class=“placeholder-text”>No caption but.</p>

      </div>

    </div>

    <! Speech outcomes (proven in speech mode) >

    <div class=“result-card” id=“card-asr” type=“show:none”>

      <h3>Transcription</h3>

      <div id=“asr-content”>

        <p class=“placeholder-text”>File audio to see the transcription.</p>

      </div>

    </div>

  </div>

 

  

    import { pipeline }

      from ‘https://cdn.jsdelivr.web/npm/@huggingface/transformers@3.0.2’;

 

    // ── Pipeline references ───────────────────────────────────────────────

    let classifier, captioner, transcriber;

    let readyCount = 0;

 

    // Replace a mannequin badge to “prepared” state

    operate markReady(badgeId, label) {

      const badge = doc.getElementById(badgeId);

      badge.textContent = `${label}: prepared`;

      badge.classList.add(‘prepared’);

      readyCount++;

      if (readyCount === 3) {

        globalStatus.textContent =

          ‘All fashions prepared. Add a picture or file audio.’;

        recBtn.disabled = false;

        recHint.textContent = ‘Click on to begin recording.’;

      }

    }

 

    // Load all three pipelines concurrently

    Promise.all([

      pipeline(‘image-classification’, ‘Xenova/vit-base-patch16-224’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-cls’, ‘Classifier’)

      }),

      pipeline(‘image-to-text’, ‘Xenova/vit-gpt2-image-captioning’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-cap’, ‘Captioner’)

      }),

      pipeline(‘automatic-speech-recognition’, ‘Xenova/whisper-tiny.en’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-asr’, ‘Whisper’)

      })

    ]).then(([cls, cap, asr]) => {

      classifier  = cls;

      captioner   = cap;

      transcriber = asr;

    }).catch(err => {

      globalStatus.textContent = `Error loading fashions: ${err.message}`;

    });

 

    // ── UI references ─────────────────────────────────────────────────────

    const globalStatus = doc.getElementById(‘global-status’);

    const resultsGrid  = doc.getElementById(‘results-grid’);

    const recBtn       = doc.getElementById(‘rec-btn’);

    const recHint      = doc.getElementById(‘rec-hint’);

    const recTimer     = doc.getElementById(‘rec-timer’);

    const waveCanvas   = doc.getElementById(‘wave-canvas’);

    const waveCtx      = waveCanvas.getContext(‘second’);

 

    // ── Picture evaluation ────────────────────────────────────────────────────

    async operate analyzeImage(dataUrl) {

      if (!classifier || !captioner) {

        globalStatus.textContent = ‘Fashions nonetheless loading. Please wait.’;

        return;

      }

 

      globalStatus.textContent = ‘Working classification and captioning…’;

 

      // Present picture end result playing cards, disguise speech card

      doc.getElementById(‘card-cls’).type.show = ‘block’;

      doc.getElementById(‘card-cap’).type.show = ‘block’;

      doc.getElementById(‘card-asr’).type.show = ‘none’;

      resultsGrid.type.show = ‘grid’;

 

      doc.getElementById(‘cls-content’).innerHTML =

        

Classifying…

;

      doc.getElementById(‘cap-content’).innerHTML =

        

Producing caption…

;

 

      strive {

        // Run classification and captioning in parallel

        const [classResults, captionResults] = await Promise.all([

          classifier(dataUrl, { top_k: 4 }),

          captioner(dataUrl, { max_new_tokens: 60 })

        ]);

 

        // Render classification labels

        doc.getElementById(‘cls-content’).innerHTML =

          classResults.map(({ label, rating }) => `

            <div class=“label-item”>

              <span>${label}</span>

              <span class=“label-score”>${(rating * 100).toFixed(1)}%</span>

            </div>`).be a part of();

 

        // Render generated caption

        doc.getElementById(‘cap-content’).innerHTML =

          `<p class=“caption-body”>“${captionResults[0]?.generated_text ?? ‘No caption.’}”</p>`;

 

        globalStatus.textContent = ‘Evaluation full.’;

      } catch (err) {

        globalStatus.textContent = `Error: ${err.message}`;

      }

    }

 

    // File add handler for photos

    const imgDrop  = doc.getElementById(‘img-drop’);

    const imgInput = doc.getElementById(‘img-input’);

    const imgPrev  = doc.getElementById(‘img-preview’);

 

    operate handleImageFile(file) {

      if (!file?.kind.startsWith(‘picture/’)) return;

      const reader = new FileReader();

      reader.onload = e => {

        imgPrev.src = e.goal.end result;

        imgPrev.type.show = ‘block’;

        analyzeImage(e.goal.end result);

      };

      reader.readAsDataURL(file);

    }

 

    imgDrop.addEventListener(‘click on’, () => imgInput.click on());

    imgInput.addEventListener(‘change’, e => handleImageFile(e.goal.information[0]));

    imgDrop.addEventListener(‘dragover’, e => e.preventDefault());

    imgDrop.addEventListener(‘drop’, e => {

      e.preventDefault();

      handleImageFile(e.dataTransfer.information[0]);

    });

 

    // ── Audio decoding helper ─────────────────────────────────────────────

    async operate decodeAudio(arrayBuffer) {

      const audioCtx    = new AudioContext({ sampleRate: 16000 });

      const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

      return audioBuffer.getChannelData(0);  // Mono Float32Array at 16kHz

    }

 

    // ── Speech transcription ──────────────────────────────────────────────

    async operate runTranscription(audioData) {

      // Present speech end result card, disguise picture playing cards

      doc.getElementById(‘card-cls’).type.show = ‘none’;

      doc.getElementById(‘card-cap’).type.show = ‘none’;

      doc.getElementById(‘card-asr’).type.show = ‘block’;

      resultsGrid.type.show = ‘grid’;

 

      doc.getElementById(‘asr-content’).innerHTML =

        

Transcribing…

;

 

      globalStatus.textContent = ‘Working Whisper transcription…’;

 

      strive {

        const end result = await transcriber(audioData, {

          chunk_length_s: 30,

          stride_length_s: 5

        });

        doc.getElementById(‘asr-content’).innerHTML =

          `<p class=“transcript-body”>${end result.textual content.trim()}</p>`;

        globalStatus.textContent = ‘Transcription full.’;

      } catch (err) {

        globalStatus.textContent = `Error: ${err.message}`;

      }

    }

 

    // ── Microphone recording ──────────────────────────────────────────────

    let mediaRecorder, audioChunks = [], timerInterval, analyserNode, animId;

    let secs = 0;

 

    operate drawWave() {

      const buf = new Uint8Array(analyserNode.frequencyBinCount);

      analyserNode.getByteTimeDomainData(buf);

      waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.peak);

      waveCtx.beginPath();

      waveCtx.strokeStyle = ‘#2563eb’;

      waveCtx.lineWidth = 1.5;

      buf.forEach((v, i) => {

        const x = (i / buf.size) * waveCanvas.width;

        const y = (v / 128.0) * (waveCanvas.peak / 2);

        i === 0 ? waveCtx.moveTo(x, y) : waveCtx.lineTo(x, y);

      });

      waveCtx.stroke();

      animId = requestAnimationFrame(drawWave);

    }

 

    recBtn.addEventListener(‘click on’, async () => {

      if (mediaRecorder?.state === ‘recording’) {

        mediaRecorder.cease();

        recBtn.classList.take away(‘recording’);

        recBtn.textContent = ‘🎙’;

        clearInterval(timerInterval);

        cancelAnimationFrame(animId);

        waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.peak);

        recHint.textContent = ‘Processing…’;

      } else {

        strive {

          const stream  = await navigator.mediaDevices.getUserMedia({ audio: true });

          const actx    = new AudioContext();

          analyserNode  = actx.createAnalyser();

          actx.createMediaStreamSource(stream).join(analyserNode);

          analyserNode.fftSize = 256;

 

          mediaRecorder = new MediaRecorder(stream);

          audioChunks   = [];

          mediaRecorder.ondataavailable = e => e.information.dimension && audioChunks.push(e.information);

          mediaRecorder.onstop = async () => {

            const blob        = new Blob(audioChunks, { kind: ‘audio/webm’ });

            const arrayBuffer = await blob.arrayBuffer();

            const audioData   = await decodeAudio(arrayBuffer);

            stream.getTracks().forEach(t => t.cease());

            await runTranscription(audioData);

            recHint.textContent = ‘Click on to file once more.’;

          };

 

          mediaRecorder.begin();

          recBtn.classList.add(‘recording’);

          recBtn.textContent = ‘⏹’;

          secs = 0;

          recTimer.textContent = ‘0:00’;

          timerInterval = setInterval(() => {

            secs++;

            recTimer.textContent =

              `${Math.flooring(secs / 60)}:${String(secs % 60).padStart(2, ‘0’)}`;

          }, 1000);

          recHint.textContent = ‘Recording… click on to cease.’;

          drawWave();

        } catch (err) {

          recHint.textContent = `Mic error: ${err.message}`;

        }

      }

    });

 

    // ── Tab switching ─────────────────────────────────────────────────────

    doc.querySelectorAll(‘.tab’).forEach(tab => {

      tab.addEventListener(‘click on’, () => {

        doc.querySelectorAll(‘.tab, .panel’).forEach(el =>

          el.classList.take away(‘lively’));

        tab.classList.add(‘lively’);

        doc.getElementById(`panel${tab.dataset.tab}`).classList.add(‘lively’);

      });

    });

Related Articles

LEAVE A REPLY

Please enter your comment!
Please enter your name here

[td_block_social_counter facebook="tagdiv" twitter="tagdivofficial" youtube="tagdiv" style="style8 td-social-boxed td-social-font-icons" tdc_css="eyJhbGwiOnsibWFyZ2luLWJvdHRvbSI6IjM4IiwiZGlzcGxheSI6IiJ9LCJwb3J0cmFpdCI6eyJtYXJnaW4tYm90dG9tIjoiMzAiLCJkaXNwbGF5IjoiIn0sInBvcnRyYWl0X21heF93aWR0aCI6MTAxOCwicG9ydHJhaXRfbWluX3dpZHRoIjo3Njh9" custom_title="Stay Connected" block_template_id="td_block_template_8" f_header_font_family="712" f_header_font_transform="uppercase" f_header_font_weight="500" f_header_font_size="17" border_color="#dd3333"]
- Advertisement -spot_img

Latest Articles