<title>Multimodal Media Analyzer</title>
/* Base reset and page chrome */
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}
header { margin-bottom: 1.5rem; }
header h1 { font-size: 1.5rem; }
header p { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }
/* Model status indicators */
.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}
.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}
/* `prepared` is the class the script adds to a badge once its model has loaded. */
.model-badge.prepared { background: #dcfce7; color: #15803d; }
/* Tab bar */
.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}
.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}
/* `lively` is the selected-state class toggled by the tab-switching script. */
.tab.lively { background: #2563eb; color: white; font-weight: 600; }
/* Input panels: only the panel carrying `lively` is visible */
.panel { display: none; }
.panel.lively { display: block; }
.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}
/* The native file input stays hidden; the drop zone forwards clicks to it. */
.upload-area input { display: none; }
#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none; /* revealed by the script once an image has been read */
  object-fit: cover;
}
.mic-center { text-align: center; padding: 1rem 0; }
#rec-btn {
  width: 72px; height: 72px;
  border-radius: 50%; border: none;
  background: #dc2626; color: white;
  font-size: 1.6rem; cursor: pointer;
  display: flex; align-items: center; justify-content: center;
  margin: 0 auto 0.5rem;
}
#rec-btn.recording { background: #374151; }
#rec-btn:disabled { background: #94a3b8; cursor: not-allowed; }
#rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }
#rec-hint { font-size: 0.85rem; color: #64748b; }
#wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }
/* Results grid */
.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}
.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}
.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}
/* One row per classification label: name on the left, score pill on the right */
.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}
.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}
.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}
.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}
.placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }
/* Status line shown above the tabs */
#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}
/* Narrow screens: stack result cards in a single column */
@media (max-width: 500px) {
  .results-grid { grid-template-columns: 1fr; }
}
<header>
  <h1>Multimodal Media Analyzer</h1>
  <p>Image classification, captioning, and speech transcription — all in your browser.</p>
  <!-- One badge per model; the script rewrites each badge's text when its model is ready -->
  <div class="model-status-bar">
    <span class="model-badge" id="badge-cls">Classifier: loading...</span>
    <span class="model-badge" id="badge-cap">Captioner: loading...</span>
    <span class="model-badge" id="badge-asr">Whisper: loading...</span>
  </div>
</header>
<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>
<!-- data-tab must match the suffix of the panel id (panel-image / panel-speech) -->
<div class="tabs">
  <div class="tab lively" data-tab="image">🖼 Image Analysis</div>
  <div class="tab" data-tab="speech">🎙 Speech Transcription</div>
</div>
<!-- Image panel -->
<div class="panel lively" id="panel-image">
  <div class="upload-area" id="img-drop">
    <p>Click or drag an image to analyze</p>
    <p style="font-size:0.8rem;color:#94a3b8;margin-top:0.3rem">
      JPG, PNG, WebP, GIF supported
    </p>
    <!-- Hidden file input the script looks up as #img-input and clicks on behalf of the drop zone -->
    <input type="file" id="img-input" accept="image/*" />
  </div>
  <img id="img-preview" alt="Preview" />
</div>
<!-- Speech panel -->
<div class="panel" id="panel-speech">
  <div class="mic-center">
    <button id="rec-btn" disabled>🎙</button>
    <div id="rec-timer">0:00</div>
    <!-- Waveform surface the script looks up as #wave-canvas.
         NOTE(review): the 300x60 size is an assumption; adjust to the intended design. -->
    <canvas id="wave-canvas" width="300" height="60"></canvas>
    <div id="rec-hint">Waiting for Whisper model...</div>
  </div>
</div>
<!-- Results - shown for both modes; the grid and every card start hidden and are revealed by the script -->
<div class="results-grid" id="results-grid" style="display:none">
  <!-- Image results (shown in image mode) -->
  <div class="result-card" id="card-cls" style="display:none">
    <h3>Classification</h3>
    <div id="cls-content">
      <p class="placeholder-text">No results yet.</p>
    </div>
  </div>
  <div class="result-card" id="card-cap" style="display:none">
    <h3>Caption</h3>
    <div id="cap-content">
      <p class="placeholder-text">No caption yet.</p>
    </div>
  </div>
  <!-- Speech results (shown in speech mode) -->
  <div class="result-card" id="card-asr" style="display:none">
    <h3>Transcription</h3>
    <div id="asr-content">
      <p class="placeholder-text">Record audio to see the transcription.</p>
    </div>
  </div>
</div>
import { pipeline }
  from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';
// ── Pipeline references ───────────────────────────────────────────────
// Filled in once the corresponding model has finished loading; undefined until then.
let classifier, captioner, transcriber;
// Number of models that have reported ready (the record button unlocks at 3).
let readyCount = 0;
/**
 * Update a model badge to its "ready" state and, once all three models have
 * reported in, unlock the recording UI.
 *
 * Safe to call more than once for the same badge: repeat calls are ignored so
 * `readyCount` only ever counts distinct models.
 *
 * @param {string} badgeId - id of the badge element to update.
 * @param {string} label - human-readable model name shown in the badge.
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);
  // `prepared` doubles as the "already counted" marker for this badge.
  if (badge.classList.contains('prepared')) return;

  badge.textContent = `${label}: ready`;
  badge.classList.add('prepared');
  readyCount++;

  if (readyCount === 3) {
    globalStatus.textContent =
      'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}
// Load all three pipelines concurrently
/**
 * Start loading one pipeline and flag its badge when it is actually usable.
 *
 * Readiness is tied to the pipeline promise resolving rather than to a
 * progress event: the library reports a `'done'` progress status once per
 * downloaded file, so keying on it marks a model ready too early and more
 * than once.
 *
 * @param {string} task - pipeline task name.
 * @param {string} model - model id to load.
 * @param {string} badgeId - id of the status badge for this model.
 * @param {string} label - display name used in the badge text.
 * @param {(pipe: Function) => void} assign - stores the loaded pipeline.
 * @returns {Promise<void>} settles when the model is loaded (or failed).
 */
function loadPipeline(task, model, badgeId, label, assign) {
  return pipeline(task, model, { dtype: 'q8' }).then((pipe) => {
    // Store the pipeline before flipping the badge so the UI never unlocks
    // ahead of the reference it depends on.
    assign(pipe);
    markReady(badgeId, label);
  });
}

Promise.all([
  loadPipeline('image-classification', 'Xenova/vit-base-patch16-224',
    'badge-cls', 'Classifier', (pipe) => { classifier = pipe; }),
  loadPipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning',
    'badge-cap', 'Captioner', (pipe) => { captioner = pipe; }),
  loadPipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en',
    'badge-asr', 'Whisper', (pipe) => { transcriber = pipe; }),
]).catch((err) => {
  globalStatus.textContent = `Error loading models: ${err.message}`;
});
// ── UI references ─────────────────────────────────────────────────────
// Looked up once at start-up; every id below must exist in the markup.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');
const waveCanvas = document.getElementById('wave-canvas');
const waveCtx = waveCanvas.getContext('2d');
// ── Image analysis ────────────────────────────────────────────────────
/**
 * Classify and caption an image, then render both results.
 *
 * Does nothing except update the status line while the classifier or
 * captioner is still loading. Errors raised by either model are reported in
 * the status line rather than thrown.
 *
 * @param {string} dataUrl - image source handed to both pipelines.
 * @returns {Promise<void>}
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }
  globalStatus.textContent = 'Running classification and captioning…';

  // Show image result cards, hide speech card
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');
  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Render classification labels. Model-produced text goes through
    // textContent so it is never interpreted as markup.
    const rows = classResults.map(({ label, score }) => {
      const row = document.createElement('div');
      row.className = 'label-item';
      const name = document.createElement('span');
      name.textContent = label;
      const pct = document.createElement('span');
      pct.className = 'label-score';
      pct.textContent = `${(score * 100).toFixed(1)}%`;
      row.append(name, pct);
      return row;
    });
    clsContent.replaceChildren(...rows);

    // Render generated caption
    const caption = document.createElement('p');
    caption.className = 'caption-body';
    caption.textContent =
      `"${captionResults[0]?.generated_text ?? 'No caption.'}"`;
    capContent.replaceChildren(caption);

    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}
// File upload handler for images
const imgDrop = document.getElementById('img-drop');     // clickable / droppable zone
const imgInput = document.getElementById('img-input');   // hidden native file picker
const imgPrev = document.getElementById('img-preview');  // preview <img>
/**
 * Read a user-supplied image, show it in the preview and start analysis.
 *
 * Missing files and files whose MIME type is not `image/*` are ignored.
 *
 * @param {File | undefined} file - file taken from the picker or a drop event.
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) return;

  const reader = new FileReader();
  reader.onload = (event) => {
    const dataUrl = event.target.result;
    imgPrev.src = dataUrl;
    imgPrev.style.display = 'block';
    analyzeImage(dataUrl);
  };
  reader.readAsDataURL(file);
}
// Clicking the drop zone opens the hidden file picker.
imgDrop.addEventListener('click', () => imgInput.click());
imgInput.addEventListener('change', (e) => handleImageFile(e.target.files[0]));
// preventDefault on dragover is what marks the zone as a valid drop target.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());
imgDrop.addEventListener('drop', (e) => {
  e.preventDefault(); // stop the browser from navigating to the dropped file
  handleImageFile(e.dataTransfer.files[0]);
});
// ── Audio decoding helper ─────────────────────────────────────────────
/**
 * Decode an encoded audio buffer into raw samples for the speech model.
 *
 * A dedicated 16 kHz AudioContext is used for decoding and is always closed
 * afterwards, including when decoding fails, so repeated recordings do not
 * accumulate open contexts.
 *
 * @param {ArrayBuffer} arrayBuffer - encoded audio (e.g. a recorded blob).
 * @returns {Promise<Float32Array>} samples of channel 0.
 * @throws whatever `decodeAudioData` rejects with for undecodable input.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0); // Mono Float32Array at 16kHz
  } finally {
    await audioCtx.close();
  }
}
// ── Speech transcription ──────────────────────────────────────────────
/**
 * Transcribe recorded audio and render the text in the transcription card.
 *
 * Does nothing except update the status line while the speech model is still
 * loading. Errors raised by the model are reported in the status line rather
 * than thrown.
 *
 * @param {Float32Array} audioData - decoded audio samples to transcribe.
 * @returns {Promise<void>}
 */
async function runTranscription(audioData) {
  if (!transcriber) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  // Show speech result card, hide image cards
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';

  const asrContent = document.getElementById('asr-content');
  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';
  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });
    // Transcribed speech is arbitrary text: insert it via textContent so it
    // can never be parsed as markup.
    const transcript = document.createElement('p');
    transcript.className = 'transcript-body';
    transcript.textContent = result.text.trim();
    asrContent.replaceChildren(transcript);
    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}
// ── Microphone recording ──────────────────────────────────────────────
// Mutable recorder state shared by the waveform renderer and the
// record-button handler.
let mediaRecorder;     // active MediaRecorder, if any
let audioChunks = [];  // data blobs collected for the current recording
let timerInterval;     // setInterval id driving the elapsed-time display
let analyserNode;      // analyser feeding the waveform drawing
let animId;            // requestAnimationFrame id of the waveform loop
let secs = 0;          // elapsed recording time in seconds
/**
 * Draw one frame of the live waveform and schedule the next one.
 *
 * Reads the current time-domain samples from `analyserNode`, plots them
 * across the full canvas width, and stores the pending animation-frame id in
 * `animId` so the loop can be cancelled when recording stops.
 */
function drawWave() {
  const samples = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(samples);

  const { width, height } = waveCanvas;
  waveCtx.clearRect(0, 0, width, height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;

  samples.forEach((value, index) => {
    const x = (index / samples.length) * width;
    // Byte samples are centred on 128, which maps to the canvas mid-line.
    const y = (value / 128.0) * (height / 2);
    if (index === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  });

  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}
// Record button: first click starts a recording, second click stops it and
// hands the captured audio to the transcriber.
recBtn.addEventListener('click', async () => {
  // ── Stop an in-progress recording ──
  if (mediaRecorder?.state === 'recording') {
    mediaRecorder.stop(); // triggers onstop below, which does the processing
    recBtn.classList.remove('recording');
    recBtn.textContent = '🎙';
    clearInterval(timerInterval);
    cancelAnimationFrame(animId);
    waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
    recHint.textContent = 'Processing…';
    return;
  }

  // ── Start a new recording ──
  let stream;
  let actx;
  // Release the microphone and the analyser's AudioContext. Each recording
  // creates a fresh context, so it must be closed or contexts pile up.
  const releaseCapture = () => {
    stream?.getTracks().forEach((track) => track.stop());
    if (actx && actx.state !== 'closed') {
      actx.close().catch(() => { /* already closing; nothing to recover */ });
    }
  };

  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Analyser tap used only for the live waveform.
    actx = new AudioContext();
    analyserNode = actx.createAnalyser();
    actx.createMediaStreamSource(stream).connect(analyserNode);
    analyserNode.fftSize = 256;

    mediaRecorder = new MediaRecorder(stream);
    audioChunks = [];
    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size) audioChunks.push(e.data);
    };
    mediaRecorder.onstop = async () => {
      releaseCapture();
      try {
        const blob = new Blob(audioChunks, { type: 'audio/webm' });
        const arrayBuffer = await blob.arrayBuffer();
        const audioData = await decodeAudio(arrayBuffer);
        await runTranscription(audioData);
        recHint.textContent = 'Click to record again.';
      } catch (err) {
        // Without this, a decode failure would leave the hint on "Processing…".
        recHint.textContent = `Audio error: ${err.message}`;
      }
    };

    mediaRecorder.start();
    recBtn.classList.add('recording');
    recBtn.textContent = '⏹';

    secs = 0;
    recTimer.textContent = '0:00';
    timerInterval = setInterval(() => {
      secs++;
      recTimer.textContent =
        `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
    }, 1000);

    recHint.textContent = 'Recording… click to stop.';
    drawWave();
  } catch (err) {
    // Start-up failed part-way: do not leave the microphone or context open.
    releaseCapture();
    recHint.textContent = `Mic error: ${err.message}`;
  }
});
// ── Tab switching ─────────────────────────────────────────────────────
// Clicking a tab moves the `lively` class to that tab and to the panel whose
// id is `panel-` followed by the tab's data-tab value.
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    document.querySelectorAll('.tab, .panel').forEach((el) =>
      el.classList.remove('lively'));
    tab.classList.add('lively');
    document.getElementById(`panel-${tab.dataset.tab}`).classList.add('lively');
  });
});
