International College of Digital Innovation, CMU
October 10, 2025
(async () => {
// ========= Layout & Styles =========
// Root element of the "Step 1 — Data" cell: a two-column grid with the preset
// picker and action buttons on the left, and the KPI cards plus document table
// on the right. Controls are mounted later into #ctrl, #actions, #kpi and #view.
//
// Every rule of the embedded stylesheet is prefixed with the root class
// `.nlp-step1`. A <style> element applies to the whole document, and the other
// cells in this file declare the same class names (.kpi, .k, .tbl, .ripple, ...)
// with different values, so unprefixed rules override one another depending on
// document order. `.row` is declared here as well because #actions uses it and
// this cell had no rule of its own for it.
const box = html`<div class="nlp-step1" style="max-width:960px;font:14px system-ui,-apple-system,Segoe UI,Roboto,sans-serif;color:#0f172a;">
<style>
.nlp-step1 .wrap{display:grid;grid-template-columns:320px 1fr;gap:14px}
.nlp-step1 .side{border:1px solid #cbd5e1;border-radius:12px;padding:12px;background:#f8fafc}
.nlp-step1 .main{display:grid;gap:12px}
.nlp-step1 .h{font-weight:700;margin:4px 0 6px}
.nlp-step1 .group{border:1px dashed #cbd5e1;border-radius:10px;padding:10px;background:#fff;margin-bottom:10px}
.nlp-step1 .group>.title{font-weight:700;margin-bottom:6px}
.nlp-step1 .row{display:flex;gap:8px;flex-wrap:wrap}
.nlp-step1 .kpi{display:grid;grid-template-columns:repeat(3,minmax(0,1fr));gap:8px}
.nlp-step1 .k{border:1px solid #cbd5e1;border-radius:12px;background:#fff;padding:10px}
.nlp-step1 .k b{display:block;font-size:18px;margin-bottom:4px}
.nlp-step1 .tbl{border-collapse:collapse;width:100%}
.nlp-step1 .tbl th,.nlp-step1 .tbl td{border:1px solid #cbd5e1;padding:6px 8px;text-align:left;vertical-align:top}
.nlp-step1 .small{font-size:12px}
.nlp-step1 .mono{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.nlp-step1 .pill{display:inline-block;padding:8px 12px;border-radius:999px;border:1px solid #cbd5e1;background:#fff;cursor:pointer;user-select:none;position:relative;overflow:hidden;transition:transform .08s ease, box-shadow .2s ease}
.nlp-step1 .pill:active{transform:scale(0.98)}
.nlp-step1 .ripple{position:absolute;border-radius:999px;transform:scale(0);opacity:.35;pointer-events:none;background:#0ea5e9;animation:ripple .5s ease-out forwards}
@keyframes ripple{to{transform:scale(8);opacity:0}}
.nlp-step1 .hint{color:#475569;font-size:12px}
</style>
<div class="wrap">
<div class="side">
<div class="h">1) Data — Preset only</div>
<div class="group">
<div class="title">Preset corpus</div>
<div id="ctrl"></div>
<div class="hint" style="margin-top:6px">เลือกชุดข้อมูลเพื่อพรีวิวด้านขวา (ไม่มีการแก้ไข/เพิ่มข้อมูล)</div>
</div>
<div class="group">
<div class="title">Actions</div>
<div class="row" id="actions"></div>
<div class="hint" style="margin-top:6px">Copy รายการเอกสารทั้งหมด หรือสุ่มสลับลำดับเพื่อดูตัวอย่างหลากหลาย</div>
</div>
</div>
<div class="main">
<div class="h">Step 1 — Data (Preview)</div>
<div id="kpi" class="kpi"></div>
<div id="view"></div>
</div>
</div>
</div>`;
// ---------- Ripple for buttons ----------
/**
 * Append a short-lived ".ripple" span to `btn`, centred on the pointer.
 *
 * @param {Element} btn - button that receives the ripple.
 * @param {MouseEvent} ev - click event; its clientX/clientY locate the ripple.
 */
function makeRipple(btn, ev){
  const bounds = btn.getBoundingClientRect();
  // The ripple is a square as large as the button's longer side.
  const size = Math.max(bounds.width, bounds.height);
  const half = size / 2;

  const ripple = html`<span class="ripple"></span>`;
  ripple.style.height = `${size}px`;
  ripple.style.width = `${size}px`;
  // Pointer position relative to the button, shifted so the square is centred on it.
  ripple.style.left = `${ev.clientX - bounds.left - half}px`;
  ripple.style.top = `${ev.clientY - bounds.top - half}px`;

  btn.appendChild(ripple);
  // The CSS animation lasts .5s; drop the node shortly afterwards.
  setTimeout(() => ripple.remove(), 600);
}
// One delegated listener on the root: any click that lands on (or inside) a
// ".pill" button within this cell gets a ripple.
box.addEventListener("click", (event) => {
  const pill = event.target.closest(".pill");
  if (pill && box.contains(pill)) {
    makeRipple(pill, event);
  }
});
// ---------- Expanded Preset CORPUS ----------
// Preset corpora for the preview: ten named presets with ten short English
// texts each. The keys are shown as options of the preset picker, and the
// array stored under a key is replaced (reordered) by the "Shuffle order" action.
const CORPUS = {
"Mini Reviews":[
"I absolutely loved the movie — funny, charming, and moving.",
"The plot was predictable, but the soundtrack was excellent.",
"Unfunny jokes and slow pacing made it boring.",
"A touching finale that left me smiling.",
"Average overall; strong visuals but uneven writing.",
"Battery lasts two days; camera is sharp but performance stutters.",
"Disappointing — the screen cracked and support was unhelpful.",
"Great build quality and the speakers are loud.",
"My account was charged twice this month; please refund.",
"The app crashes whenever I upload photos."
],
"Support Emails":[
"I can't reset my password after the update.",
"How can I change the email associated with my account?",
"The invoice download button is missing.",
"Please help—I'm charged twice for the same order.",
"Upload crashes on iOS 17 when selecting multiple photos.",
"Shipping address change request for order #4312.",
"Refund was approved but I didn't receive the money.",
"Two-factor code never arrives to my phone.",
"Attachment upload stalls at 95 percent.",
"Pricing page shows old plan tiers."
],
"Mixed Topics":[
"The striker scored a late winner in extra time.",
"Parliament passed the new budget after a long debate.",
"Researchers unveiled a prototype quantum processor.",
"The coach announced the final squad for the cup.",
"The minister discussed trade policy with reporters.",
"The company released a major update to its OS.",
"Team signed a veteran defender on a two-year deal.",
"Senate will vote on the climate package next week.",
"Startups race to build faster AI accelerators.",
"Forward missed training due to a minor injury."
],
"App Store Reviews":[
"The latest update fixed my login bug—thanks!",
"Push notifications are delayed by several hours.",
"UI feels smoother and the dark mode is perfect.",
"Crashes on startup on my tablet after installing.",
"Premium features are worth the price in my opinion.",
"Sync between devices is inconsistent and slow.",
"Love the new widgets on the home screen.",
"Why is the export PDF option removed?",
"Great app overall but search is weak.",
"Customer support replied within minutes."
],
"Hotel Reviews":[
"Room was spotless and the staff were incredibly friendly.",
"Air conditioning was loud and kept me awake.",
"Breakfast buffet had many fresh options.",
"Elevators were slow during peak hours.",
"Fantastic location near the central station.",
"Shower pressure was low and water wasn’t hot.",
"Concierge arranged a great city tour for us.",
"Wifi kept disconnecting on the 8th floor.",
"Beds were comfy and blackout curtains worked well.",
"Check-in took too long with only one clerk."
],
"Food Delivery Tickets":[
"Order arrived cold and missing the side dish.",
"Delivery was fast; everything was still hot.",
"Rider couldn’t find my address and called twice.",
"Sauce spilled inside the bag—please pack better.",
"Wrong drink size sent with the combo.",
"Great packaging and the fries stayed crisp.",
"Late by 30 minutes beyond the ETA.",
"Item substituted without asking; I want a refund.",
"Coupon code applied successfully—nice discount!",
"Contactless delivery worked as expected."
],
"Tech News Headlines":[
"Chipmaker announces 3nm roadmap and new AI accelerator.",
"Open-source model beats prior benchmarks on translation.",
"Cyberattack disrupts cloud provider’s European region.",
"Startup raises Series B to build spatial computing tools.",
"Quantum error correction shows promising results in lab.",
"Major browser ships passkeys by default for users.",
"Researchers publish dataset for multimodal reasoning.",
"Datacenter cooling breakthrough reduces power usage.",
"New programming language gains traction among ML engineers.",
"Robotics firm unveils warehouse automation platform."
],
"E-commerce Product Q&A":[
"Does this laptop support dual monitors via USB-C?",
"What is the return policy for refurbished phones?",
"Is the watch band compatible with Series 7?",
"How long does the battery last with GPS enabled?",
"Can I wash the cover in a machine?",
"Does it come with international warranty?",
"Is the keyboard backlit and spill-resistant?",
"What’s the max RAM upgrade supported?",
"Is there a student discount on this bundle?",
"Does it include a carrying case?"
],
"Social Posts (Short)":[
"Finally hit my 10k steps today 🎉",
"Anyone else getting app crashes after the update?",
"That soundtrack is still stuck in my head!",
"Coffee first, then emails ☕️",
"PSA: roadworks on 3rd Ave—expect delays.",
"Can’t believe the season finale ended like that!",
"Sunset at the beach was unreal.",
"Just adopted a puppy—name ideas?",
"VPN speeds are super slow tonight.",
"Back to the gym after a month off!"
],
"Programming Forums":[
"How to vectorize this loop in NumPy without broadcasting errors?",
"Best way to debounce async input in React?",
"Why is my Docker image so large after multi-stage build?",
"gRPC vs REST for internal microservices—tradeoffs?",
"PostgreSQL index not used with LIKE query—tips?",
"Kubernetes liveness probe failing on startup—workarounds?",
"Python multiprocessing hangs on Mac—any fixes?",
"Memory leak in Rust with Arc and Mutex?",
"How to speed up TF-IDF on millions of documents?",
"CI pipeline caching not working for node_modules."
]
};
// ---------- Controls ----------
// Mount points inside the layout, resolved in one pass.
const [ctrl, actions, kpi, view] = ["#ctrl", "#actions", "#kpi", "#view"]
  .map((selector) => box.querySelector(selector));

// Preset picker: one option per CORPUS key.
const sel = Inputs.select(Object.keys(CORPUS), { label: "Choose preset" });
ctrl.append(sel);

// Action buttons; their click handlers are attached further down.
const btnShuffle = html`<button class="pill">Shuffle order</button>`;
const btnCopy = html`<button class="pill">Copy all to clipboard</button>`;
actions.append(btnShuffle, btnCopy);
/**
 * Seeded linear congruential generator.
 *
 * @param {number} seed - coerced to an unsigned 32-bit integer.
 * @returns {() => number} function yielding the next value in [0, 1] on each call.
 */
function rand(seed){
  let state = seed >>> 0;
  return function next(){
    // state = (state * 1664525 + 1013904223) mod 2^32
    state = (Math.imul(state, 1664525) + 1013904223) >>> 0;
    return state / 0xffffffff;
  };
}
/** Return a shallow copy of the documents of the currently selected preset. */
function getDocs(){
  const selected = CORPUS[sel.value];
  return [...selected];
}
// ---------- Stats ----------
/**
 * Summary statistics for a list of documents.
 *
 * @param {string[]} docs - documents to summarise.
 * @returns {{n: number, avg: number, chars: number}}
 *   n     - number of documents;
 *   avg   - mean number of whitespace-separated tokens per document (0 when empty);
 *   chars - total number of characters, counted as Unicode code points.
 */
function stats(docs){
  const n = docs.length;
  let tokenTotal = 0;
  let chars = 0;
  for (const doc of docs) {
    tokenTotal += doc.split(/\s+/).filter(Boolean).length;
    // Spread iterates by code point, so an emoji such as "🎉" counts once;
    // `doc.length` would count its two UTF-16 code units.
    chars += [...doc].length;
  }
  return { n, avg: n ? tokenTotal / n : 0, chars };
}
// ---------- Render ----------
/** Rebuild the KPI cards and the document table for the currently selected preset. */
function render(){
  const docs = getDocs();
  const { n, avg, chars } = stats(docs);

  // KPI: value on top, caption below, optional hint line.
  const card = (t, v, sub = "") => html`<div class="k"><b>${v}</b><div>${t}</div>${sub?html`<div class="hint">${sub}</div>`:""}</div>`;
  kpi.innerHTML = "";
  kpi.append(
    card("Documents", n),
    card("Avg length (tokens)", avg.toFixed(1)),
    card("Total characters", chars)
  );

  // Table: one numbered row per document.
  const rows = docs.map((t, i) => html`<tr><td>${i+1}</td><td>${t}</td></tr>`);
  const tb = html`<tbody></tbody>`;
  tb.append(...rows);
  const tbl = html`<table class="tbl small"></table>`;
  tbl.append(html`<thead><tr><th>#</th><th>Text</th></tr></thead>`, tb);

  view.innerHTML = "";
  view.append(tbl);
}
// ---------- Actions ----------
// "Shuffle order": reorder the current preset by sorting on pseudo-random keys.
// The generator is re-created with seed 42 on every click, so the key sequence
// (and therefore the permutation applied to the current order) is the same each time.
btnShuffle.onclick = () => {
  const current = getDocs();
  const nextKey = rand(42);
  const keyed = current.map((doc) => ({ doc, key: nextKey() }));
  keyed.sort((left, right) => left.key - right.key);
  CORPUS[sel.value] = keyed.map(({ doc }) => doc); // replace order only for current session
  render();
};
// "Copy all": put every document of the current preset on the clipboard, one
// per line, and show the outcome on the button label for 1.2 s.
btnCopy.onclick = async () => {
  const payload = getDocs().join("\n");
  let outcome;
  try {
    await navigator.clipboard.writeText(payload);
    outcome = "Copied!";
  } catch {
    // Clipboard access can be unavailable or denied; report it on the button.
    outcome = "Copy failed";
  }
  btnCopy.textContent = outcome;
  setTimeout(() => { btnCopy.textContent = "Copy all to clipboard"; }, 1200);
};
// Re-render the KPI cards and table whenever the preset selection changes.
sel.addEventListener("input", render);
// Initial
render();
// Hand the root element back as the result of the enclosing async function.
return box;
})();(async () => {
// ============== Layout & Styles ==============
// Root element of the tokenization demo: control groups on the left
// (#ctrl-text, #ctrl-ex, #ctrl-pre, #ctrl-norm, #ctrl-ng), and on the right the
// tab strip (#tabs-h / #tabs-b), the pipeline summary (#pipe) and the footer
// counts (#freq).
//
// Every rule of the embedded stylesheet is prefixed with the root class
// `.nlp-tok`. A <style> element applies to the whole document, and the other
// cells in this file declare the same class names (.tbl, .ripple, .main, ...)
// with different values, so unprefixed rules override one another depending on
// document order.
const box = html`<div class="nlp-tok" style="max-width:1060px;font:14px system-ui,-apple-system,Segoe UI,Roboto,sans-serif;color:#0f172a">
<style>
.nlp-tok .wrap{display:grid;grid-template-columns:340px 1fr;gap:14px;align-items:start}
.nlp-tok .side{border:1px solid #cbd5e1;border-radius:12px;padding:12px;background:#f8fafc}
.nlp-tok .main{display:grid;gap:8px}
.nlp-tok .h{font-weight:700;margin:2px 0 6px}
.nlp-tok .group{border:1px dashed #cbd5e1;border-radius:10px;background:#fff;padding:10px;margin-bottom:10px}
.nlp-tok .group>.title{font-weight:700;margin-bottom:6px}
.nlp-tok .pill, .nlp-tok .tabs-head button{
display:inline-block;padding:8px 12px;border-radius:999px;border:1px solid #cbd5e1;background:#fff;cursor:pointer;
position:relative;overflow:hidden;transition:transform .08s ease, box-shadow .2s ease, background .15s ease;
user-select:none
}
.nlp-tok .pill:active, .nlp-tok .tabs-head button:active{ transform:scale(0.98) }
.nlp-tok .btn-raised:hover{ box-shadow:0 2px 10px rgba(2,6,23,.08) }
/* Ripple */
.nlp-tok .ripple{
position:absolute;border-radius:999px;transform:scale(0);opacity:.35;pointer-events:none;background:#0284c7;
animation:ripple .5s ease-out forwards;
}
@keyframes ripple{ to{ transform:scale(8); opacity:0; } }
.nlp-tok .hint{color:#475569;font-size:12px}
.nlp-tok .mono{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.nlp-tok .call{border:1px solid #cbd5e1;border-left:4px solid #0369a1;background:#f8fafc;border-radius:10px;padding:10px}
.nlp-tok .tokens{line-height:1.9}
.nlp-tok .tok{display:inline-block;margin:2px 3px;padding:2px 6px;border-radius:8px;border:1px solid #e2e8f0}
.nlp-tok .tbl{border-collapse:collapse;width:100%}
.nlp-tok .tbl th,.nlp-tok .tbl td{border:1px solid #cbd5e1;padding:6px 8px;text-align:left;vertical-align:top}
.nlp-tok .tbl th{background:#f1f5f9}
.nlp-tok .chg{background:#fff7ed} /* highlight changed */
.nlp-tok textarea.ta{width:100%;min-height:120px;border:1px solid #cbd5e1;border-radius:8px;padding:8px;font:13px/1.4 ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.nlp-tok .row{display:flex;gap:8px;flex-wrap:wrap;align-items:flex-end}
.nlp-tok .col{min-width:210px}
.nlp-tok .tabs{border:1px solid #cbd5e1;border-radius:12px;background:#fff}
.nlp-tok .tabs-head{display:flex;gap:6px;padding:6px;border-bottom:1px solid #e2e8f0}
.nlp-tok .tabs-head button.active{color:#fff;border-color:transparent;background:#0369a1}
.nlp-tok .tabs-body{padding:10px}
.nlp-tok .hl{background:#fff7ed;border:1px solid #fed7aa;border-radius:8px;padding:8px}
.nlp-tok .box3{display:grid;grid-template-columns:repeat(3, minmax(0,1fr)); gap:8px}
@media (max-width:900px){ .nlp-tok .wrap{grid-template-columns:1fr} .nlp-tok .box3{grid-template-columns:1fr} }
.nlp-tok .badge{display:inline-block;padding:2px 8px;border-radius:999px;background:#e2e8f0;font-size:12px}
.nlp-tok select.sel{width:100%;padding:8px 10px;border:1px solid #cbd5e1;border-radius:10px;background:#fff}
.nlp-tok .muted{color:#64748b}
</style>
<div class="wrap">
<div class="side">
<div class="h">Tokenization Controls</div>
<div class="group">
<div class="title">1) Input Text</div>
<div id="ctrl-text"></div>
</div>
<div class="group">
<div class="title">Quick Examples (Dropdown)</div>
<div id="ctrl-ex"></div>
<div class="hint" style="margin-top:6px">เลือกตัวอย่างที่โชว์ผลต่าง None / Stemming / Lemmatization ชัดเจน</div>
</div>
<div class="group">
<div class="title">2) Preprocessing</div>
<div id="ctrl-pre"></div>
</div>
<div class="group">
<div class="title">3) Normalization</div>
<div id="ctrl-norm"></div>
</div>
<div class="group">
<div class="title">4) N-grams</div>
<div id="ctrl-ng"></div>
</div>
</div>
<div class="main">
<div class="h">Interactive Tokenization Demo</div>
<div class="tabs">
<div class="tabs-head" id="tabs-h"></div>
<div class="tabs-body" id="tabs-b"></div>
</div>
<div class="call">
<b>Pipeline Summary</b>
<div id="pipe" class="mono" style="margin-top:6px"></div>
</div>
<div id="freq"></div>
</div>
</div>
</div>`;
// ---- Mount targets
// Containers inside the layout, resolved in one pass (same order as the selectors).
const [ctrlText, ctrlPre, ctrlNorm, ctrlNg, ctrlEx, tabsH, tabsB, pipeEl, freqEl] = [
  "#ctrl-text",
  "#ctrl-pre",
  "#ctrl-norm",
  "#ctrl-ng",
  "#ctrl-ex",
  "#tabs-h",
  "#tabs-b",
  "#pipe",
  "#freq",
].map((selector) => box.querySelector(selector));
// ============== Ripple effect for ALL buttons ==============
/**
 * Append a short-lived ".ripple" span to `btn`.
 *
 * @param {Element} btn - button that receives the ripple.
 * @param {MouseEvent} [ev] - click event; when omitted the ripple starts at
 *   the centre of the button.
 */
function makeRipple(btn, ev){
  const bounds = btn.getBoundingClientRect();
  // The ripple is a square as large as the button's longer side.
  const size = Math.max(bounds.width, bounds.height);
  const half = size / 2;
  // Origin relative to the button: pointer position, or the centre without an event.
  const originX = ev ? ev.clientX - bounds.left : bounds.width / 2;
  const originY = ev ? ev.clientY - bounds.top : bounds.height / 2;

  const ripple = html`<span class="ripple"></span>`;
  ripple.style.height = `${size}px`;
  ripple.style.width = `${size}px`;
  ripple.style.left = `${originX - half}px`;
  ripple.style.top = `${originY - half}px`;

  btn.appendChild(ripple);
  // The CSS animation lasts .5s; drop the node shortly afterwards.
  setTimeout(() => ripple.remove(), 600);
}
// One delegated listener on the root: a click on (or inside) any <button>
// within this cell gets a ripple.
box.addEventListener("click", (event) => {
  const button = event.target.closest("button");
  if (button && box.contains(button)) {
    makeRipple(button, event);
  }
});
// ============== Stopwords & Normalizers ==============
// Lowercase English stopwords removed by the "Remove English stopwords" toggle.
const EN_STOP = new Set(
  [
    "a an and are as at be but by for if in into is it its",
    "of on or such t that the their then there these they this",
    "to was will with you your i me my we our he she them his her",
  ].flatMap((line) => line.split(" "))
);
/**
 * Crude suffix-stripping stemmer for lowercase English words.
 *
 * Rules are tried in order and the first one that returns wins; a plain
 * plural "s" is stripped first and the remaining rules then see the shorter word.
 *
 * @param {string} w - token to stem.
 * @returns {string} the stem (words of three characters or fewer are returned as-is).
 */
function stem(w){
  if (w.length <= 3) return w;

  // Plural endings that resolve immediately.
  if (w.endsWith("ies") && w.length > 4) return `${w.slice(0, -3)}y`;
  if (w.endsWith("sses")) return w.slice(0, -2);

  // Plain plural "s" (but not "ss"): strip it and keep going.
  const word = w.endsWith("s") && !w.endsWith("ss") ? w.slice(0, -1) : w;

  if (word.endsWith("eed")) return word.slice(0, -1);

  const suffix = ["ed", "ing"].find((ending) => word.endsWith(ending));
  if (suffix !== undefined) {
    const base = word.slice(0, word.length - suffix.length);
    // "creat" -> "create", "enabl" -> "enable", "realiz" -> "realize"
    if (/(at|bl|iz)$/.test(base)) return `${base}e`;
    // "runn" -> "run": drop one letter of a doubled final consonant.
    return /([bdgmnprt])\1$/.test(base) ? base.slice(0, -1) : base;
  }

  return word.endsWith("ly") && word.length > 4 ? word.slice(0, -2) : word;
}
// Irregular forms looked up before any suffix rule of lemma() is tried.
const LEMMA_EX = new Map([
  // comparatives / superlatives
  ["better", "good"], ["best", "good"], ["worse", "bad"], ["worst", "bad"],
  // do / have
  ["did", "do"], ["does", "do"], ["done", "do"], ["doing", "do"],
  ["has", "have"], ["had", "have"],
  // verb forms the suffix rules would get wrong
  ["engaging", "engage"], ["making", "make"], ["running", "run"], ["studied", "study"],
  // irregular or special-cased plurals
  ["children", "child"], ["mice", "mouse"], ["people", "person"],
  ["batteries", "battery"], ["stories", "story"],
]);
/**
 * Collapse a doubled final consonant: "stopp" -> "stop".
 *
 * The replacement is computed from the matched pair. The earlier replacement
 * string `"$1".slice(0,1)` evaluated to "$", which turned "stopp" into "sto$".
 *
 * @param {string} s
 * @returns {string} `s` with one letter of a trailing doubled consonant removed.
 */
function undoubleLast(s){ return s.replace(/(bb|dd|ff|gg|ll|mm|nn|pp|rr|tt)$/, (pair) => pair[0]); }

/** Append the silent "e" a verb loses before -ed / -ing: "mak" -> "make". */
function addSilentE(s){ return s + "e"; }

/**
 * Rebuild a verb base after "-ed" / "-ing" has been stripped.
 * A doubled final consonant is collapsed and nothing is appended (a stem that
 * doubled its consonant has no silent e: "stopp" -> "stop", not "stope");
 * otherwise a silent "e" is appended to every base that ends in a consonant.
 */
function restoreVerbBase(base){
  if(/(bb|dd|gg|mm|nn|pp|rr|tt)$/.test(base)) return undoubleLast(base);
  if(!/[aeiou]$/.test(base)) return addSilentE(base);
  return base;
}

/**
 * Rule-based lemmatizer for lowercase English tokens.
 *
 * Irregular forms are looked up in LEMMA_EX first; after that the first
 * matching suffix rule wins. Known limitation: the silent-e rule fires for
 * every consonant-final base, so e.g. "walked" becomes "walke".
 *
 * @param {string} w - token to lemmatize.
 * @returns {string} the lemma (words of two characters or fewer are returned as-is).
 */
function lemma(w){
  if(w.length<=2) return w;
  if(LEMMA_EX.has(w)) return LEMMA_EX.get(w);
  // Plurals
  if(/ies$/.test(w) && w.length>4) return w.replace(/ies$/,"y");
  if(/sses$/.test(w)) return w.replace(/sses$/,"ss");
  if(/xes$|zes$|ches$|shes$/.test(w)) return w.replace(/es$/,"");
  // Plain "s", but leave "-ss", "-us" and "-is" words alone.
  if(/s$/.test(w) && !/ss$/.test(w) && !/(us|is)$/.test(w)) return w.replace(/s$/,"");
  // Past tense / participles
  if(/ied$/.test(w) && w.length>4) return w.replace(/ied$/,"y");
  if(/ed$/.test(w) && w.length>3) return restoreVerbBase(w.replace(/ed$/,""));
  if(/ing$/.test(w) && w.length>4) return restoreVerbBase(w.replace(/ing$/,""));
  // Adverbs
  if(/ly$/.test(w) && w.length>4) return w.replace(/ly$/,"");
  return w;
}
// ============== Controls ==============
// 1) Input text, pre-filled with a sentence that exercises the normalizers.
const ta = html`<textarea class="ta" placeholder="Type some text to tokenize...">The children were running quickly; the mice were studied in better-designed studies. He does what he did and has done it before; people said the best results were engaging.</textarea>`;
ctrlText.append(ta);

// 2) Preprocessing toggles.
const ckLower = Inputs.toggle({ label: "Lowercase", value: true });
const ckPunct = Inputs.toggle({ label: "Remove punctuation", value: true });
const ckStop = Inputs.toggle({ label: "Remove English stopwords", value: false });
for (const toggle of [ckLower, ckPunct, ckStop]) {
  toggle.classList.add("btn-raised");
}
ctrlPre.append(ckLower, ckPunct, ckStop);

// 3) Normalization mode.
const normSel = Inputs.radio(["None", "Stemming", "Lemmatization"], { label: "Normalization", value: "None" });
ctrlNorm.append(normSel);

// 4) N-gram range.
const ngSel = Inputs.select(["1 (unigram)", "1–2 (uni+bi)"], { label: "N-grams", value: "1 (unigram)" });
ctrlNg.append(ngSel);
// ============== Quick Examples (Dropdown) ==============
// [label, text] pairs; the first entry is a placeholder with an empty value.
const EXAMPLES = [
  ["— pick an example —", ""],
  ["Comparative + irregulars", "The better solution was the best one yesterday, but worse outcomes were expected today; people agreed."],
  ["Plural & irregular nouns", "Children and mice are people too; their stories and batteries were studied extensively."],
  ["Verb tenses", "He does what he did, has done it before, and is doing it again while others were running."],
  ["-ing/-ed variants", "Running, studying, and studied work were making engaging activities for children and people."],
  ["Mixed bag", "The children were running quickly; the mice were studied in better-designed studies; the best outcomes were reported."]
];

const exSel = html`<select class="sel"></select>`;
for (const [label, val] of EXAMPLES) {
  exSel.append(html`<option value="${val}">${label}</option>`);
}

// Choosing a real example replaces the textarea content and re-renders;
// the placeholder (empty value) is ignored.
exSel.onchange = () => {
  if (!exSel.value) return;
  ta.value = exSel.value;
  renderBody();
};
ctrlEx.append(exSel);
// ============== Tokenization Core ==============
/**
 * Apply the "Lowercase" and "Remove punctuation" toggles to raw text.
 *
 * @param {string} txt - raw input text.
 * @returns {string} trimmed text; with punctuation removal on, en/em dashes and
 *   every run of characters other than ASCII letters, digits, apostrophe and
 *   space are each replaced by a single space.
 */
function preprocessBaseText(txt){
  const cased = ckLower.value ? txt.toLowerCase() : txt;
  if (!ckPunct.value) return cased.trim();
  return cased
    .replace(/[\u2013\u2014]/g, " ")
    .replace(/[^a-zA-Z0-9' ]+/g, " ")
    .trim();
}
/**
 * Tokenize text with the current preprocessing toggles and an explicit
 * normalization mode.
 *
 * @param {string} txt - raw input text.
 * @param {string} normMode - "None", "Stemming" or "Lemmatization".
 * @param {boolean} includeBigrams - when true and the N-gram selector is set to
 *   "1–2 (uni+bi)", bigrams ("left_right") are appended after the unigrams.
 * @returns {string[]} unigrams in text order, optionally followed by bigrams.
 */
function tokenizeWith(txt, normMode, includeBigrams){
  const cleaned = preprocessBaseText(txt);
  let toks = cleaned.split(/\s+/).filter(Boolean);
  // EN_STOP holds lowercase entries; compare case-insensitively so stopword
  // removal also catches "The" / "He" when the "Lowercase" toggle is off.
  if(ckStop.value) toks = toks.filter((w) => !EN_STOP.has(w.toLowerCase()));
  if(normMode==="Stemming") toks = toks.map(stem);
  if(normMode==="Lemmatization") toks = toks.map(lemma);
  if(includeBigrams && ngSel.value==="1–2 (uni+bi)"){
    // Bigrams are built from the normalized unigrams that survived filtering.
    const bi = toks.slice(0, -1).map((w, i) => `${w}_${toks[i+1]}`);
    toks = toks.concat(bi);
  }
  return toks;
}
/** Tokenize with the normalization mode currently selected in the UI, bigrams allowed. */
const tokenizeBase = (txt) => tokenizeWith(txt, normSel.value, true);
// ============== Tabs (Preview) ==============
// [name, icon] per tab; a tab's index in this list is the value stored in `active`.
const TABS = [
  ["Overview", "🧭"],
  ["Tokens", "🔤"],
  ["N-gram Stats", "📊"],
  ["Normalization Compare", "🧪"],
  ["Normalization Mapping", "🗺️"]
];
// Index of the tab currently shown.
let active = 0;

/** Rebuild the tab strip; clicking a tab selects it and re-renders body and strip. */
function renderTabs(){
  tabsH.innerHTML = "";
  for (const [index, [name, icon]] of TABS.entries()) {
    const cls = index === active ? "active" : "";
    const button = html`<button class="${cls}">${icon} ${name}</button>`;
    button.onclick = () => {
      active = index;
      renderBody();
      renderTabs();
    };
    tabsH.append(button);
  }
}
/**
 * Count occurrences of each token.
 *
 * @param {Iterable<string>} arr - tokens.
 * @returns {Array<[string, number]>} [token, count] pairs, highest count first;
 *   equal counts are ordered by ascending token.
 */
function freqCount(arr){
  const counts = new Map();
  for (const token of arr) {
    const seen = counts.get(token) || 0;
    counts.set(token, seen + 1);
  }
  const pairs = [...counts.entries()];
  pairs.sort(([tokenA, countA], [tokenB, countB]) => {
    const byCount = countB - countA;
    if (byCount) return byCount;
    return tokenA > tokenB ? 1 : -1;
  });
  return pairs;
}
/**
 * Build a wrapping list of token chips.
 *
 * Token text comes from the textarea and can contain "<", ">" or "&" when
 * punctuation is kept, so it is assigned through `textContent` rather than
 * interpolated into the markup; the chip then shows the token literally and
 * does not depend on how the `html` tag treats interpolated strings.
 *
 * @param {string[]} tokens - tokens to show.
 * @param {number} [max=40] - number of chips before a "+N more" badge is added.
 * @returns {Element} a div.tokens containing the chips.
 */
function tokenListEl(tokens, max=40){
  const wrap = html`<div class="tokens"></div>`;
  for (const token of tokens.slice(0, max)) {
    const chip = html`<span class="tok mono"></span>`;
    chip.textContent = token;
    wrap.append(chip);
  }
  if(tokens.length>max) wrap.append(html`<span class="badge">+${tokens.length-max} more</span>`);
  return wrap;
}
/**
 * Re-render the active tab, the pipeline summary and the footer token counts
 * from the current textarea content and control values.
 *
 * Token text comes straight from the textarea and keeps its punctuation when
 * "Remove punctuation" is off, so it is always inserted as a Text node (or via
 * textContent) instead of being interpolated into markup as a string.
 */
function renderBody(){
  // Wraps a value in a Text node so "<", ">" and "&" are shown literally.
  const txt = (value) => document.createTextNode(String(value));
  // Shared "# / Token / Count" table body for the Tokens and N-gram tabs.
  const countBody = (pairs) => {
    const body = html`<tbody></tbody>`;
    pairs.forEach(([w,c],i)=> body.append(html`<tr><td>${i+1}</td><td class="mono">${txt(w)}</td><td>${c}</td></tr>`));
    return body;
  };

  tabsB.innerHTML = "";
  const text = ta.value || "";
  // Exactly the text the tokenizer splits; nothing extra is stripped for display,
  // so the preview keeps punctuation whenever the toggle keeps it.
  const before = preprocessBaseText(text);
  const baseTokens = tokenizeBase(text);

  // Pipeline summary
  const pipeline = [
    ckLower.value?"lowercase":"(keep case)",
    ckPunct.value?"rm_punct":"(keep punct)",
    ckStop.value? "rm_stop":"(keep stop)",
    normSel.value==="None"?"no_norm": normSel.value.toLowerCase(),
    ngSel.value==="1 (unigram)"?"unigram":"unigram+bigram"
  ].join(" > ");
  pipeEl.textContent = pipeline;

  if(active===0){
    // Overview
    const boxO = html`<div></div>`;
    boxO.append(html`<div class="h">Overview</div>`);
    const beforeEl = html`<div class="hl mono"></div>`;
    beforeEl.textContent = before || "(empty)";
    const afterEl = html`<div class="hl"></div>`;
    afterEl.append(tokenListEl(baseTokens, 40));
    // The preview is taken before stopword removal, so the caption names only
    // the two steps preprocessBaseText applies.
    boxO.append(html`<div><b>Preprocessed text</b> <span class="muted">(after lowercase/punct)</span></div>`, beforeEl);
    boxO.append(html`<div style="margin-top:8px"><b>Tokens</b> <span class="hint">(first 40)</span></div>`, afterEl);
    tabsB.append(boxO);
  }

  if(active===1){
    // Tokens table
    const boxT = html`<div></div>`;
    boxT.append(html`<div class="h">Token List</div>`);
    const tbl = html`<table class="tbl"></table>`;
    tbl.append(html`<thead><tr><th style="width:60px">#</th><th>Token</th><th style="width:90px">Count</th></tr></thead>`);
    tbl.append(countBody(freqCount(baseTokens).slice(0,120)));
    boxT.append(tbl);
    tabsB.append(boxT);
  }

  if(active===2){
    // N-gram stats
    // tokenizeWith returns the unigrams first and appends bigrams after them,
    // so the two groups are split by position. Testing for "_" would
    // misclassify a unigram that contains an underscore.
    const uni = tokenizeWith(text, normSel.value, false);
    const bi = baseTokens.slice(uni.length);
    const boxN = html`<div></div>`;
    boxN.append(html`<div class="h">N-gram Statistics</div>`);
    const row = html`<div class="row"></div>`;
    const t1 = html`<table class="tbl"></table>`;
    t1.append(html`<thead><tr><th colspan="3">Unigram (top 20)</th></tr><tr><th>#</th><th>Token</th><th>Count</th></tr></thead>`);
    t1.append(countBody(freqCount(uni).slice(0,20)));
    const t2 = html`<table class="tbl"></table>`;
    t2.append(html`<thead><tr><th colspan="3">Bigram (top 20)</th></tr><tr><th>#</th><th>Token</th><th>Count</th></tr></thead>`);
    t2.append(countBody(freqCount(bi).slice(0,20)));
    row.append(t1, t2);
    boxN.append(row);
    tabsB.append(boxN);
  }

  if(active===3){
    // Normalization Compare (token list per mode)
    const noneTok = tokenizeWith(text, "None", true);
    const stemTok = tokenizeWith(text, "Stemming", true);
    const lemmaTok = tokenizeWith(text, "Lemmatization", true);
    const boxC = html`<div></div>`;
    boxC.append(html`<div class="h">Normalization Compare (3 columns)</div>`);
    const grid = html`<div class="box3"></div>`;
    const col = (title, arr) => {
      const wrap = html`<div class="hl"></div>`;
      wrap.append(html`<div style="margin-bottom:6px"><b>${title}</b> <span class="hint">(Total ${arr.length}, Unique ${new Set(arr).size})</span></div>`);
      wrap.append(tokenListEl(arr, 60));
      return wrap;
    };
    grid.append(col("None", noneTok), col("Stemming", stemTok), col("Lemmatization", lemmaTok));
    boxC.append(grid);
    tabsB.append(boxC);
  }

  if(active===4){
    // Normalization Mapping (row-wise mapping: original -> each mode)
    // Only unigrams are used so the mapping stays clearly 1:1; each distinct
    // token appears once, in order of first occurrence.
    const orderUni = [...new Set(tokenizeWith(text, "None", false))];
    const tbl = html`<table class="tbl"></table>`;
    tbl.append(html`<thead>
<tr>
<th style="width:40px">#</th>
<th>Token (pre-norm)</th>
<th>None</th>
<th>Stemming</th>
<th>Lemmatization</th>
</tr>
</thead>`);
    const tb = html`<tbody></tbody>`;
    orderUni.forEach((w,i)=>{
      const cStem = stem(w);
      const cLemma = lemma(w);
      // A cell is highlighted when the normalizer changed the token.
      const stemChg = cStem !== w;
      const lemmaChg = cLemma !== w;
      const tr = html`<tr></tr>`;
      tr.append(html`<td>${i+1}</td>`);
      tr.append(html`<td class="mono">${txt(w)}</td>`);
      tr.append(html`<td class="mono">${txt(w)}</td>`); // after preprocess, before norm
      tr.append(html`<td class="mono ${stemChg?'chg':''}">${txt(cStem)}</td>`);
      tr.append(html`<td class="mono ${lemmaChg?'chg':''}">${txt(cLemma)}</td>`);
      tb.append(tr);
    });
    tbl.append(tb);
    const boxM = html`<div></div>`;
    boxM.append(html`<div class="h">Normalization Mapping (unigram only)</div>`);
    boxM.append(html`<div class="hint" style="margin-bottom:6px">ไฮไลต์สีส้ม = คำที่เปลี่ยนหลังทำ Stemming/Lemmatization</div>`);
    boxM.append(tbl);
    tabsB.append(boxM);
  }

  // Footer stats
  const freqInfo = html`<div style="margin-top:8px" class="hint"></div>`;
  freqInfo.textContent = `Total tokens: ${baseTokens.length} · Unique: ${new Set(baseTokens).size}`;
  freqEl.innerHTML = "";
  freqEl.append(freqInfo);
}
// First paint: tab strip, then the body of the initially active tab.
renderTabs(); renderBody();
// ============== Wiring ==============
// Any "input" event from the textarea or one of the controls re-renders the body.
[ta, ckLower, ckPunct, ckStop, normSel, ngSel].forEach(el => el.addEventListener("input", renderBody));
// Hand the root element back as the result of the enclosing async function.
return box;
})();(async () => {
// ================= Layout & Styles =================
// Root element of the feature-extraction demo: control groups on the left
// (#ctrl-corpus, #ctrl-pre, #ctrl-norm, #ctrl-ng, #ctrl-feat, #ctrl-prev), and
// on the right the KPI strip (#kpi), the tabs (#tabs-h / #tabs-b) and the
// pipeline summary (#pipe).
//
// Every rule of the embedded stylesheet is prefixed with the root class
// `.nlp-feat`. A <style> element applies to the whole document, and the other
// cells in this file declare the same class names (.kpi, .k, .tbl, .ripple, ...)
// with different values, so unprefixed rules override one another depending on
// document order.
const box = html`<div class="nlp-feat" style="max-width:1120px;font:14px system-ui,-apple-system,Segoe UI,Roboto,sans-serif;color:#0f172a">
<style>
.nlp-feat .wrap{display:grid;grid-template-columns:360px 1fr;gap:14px;align-items:start}
.nlp-feat .side{border:1px solid #cbd5e1;border-radius:12px;padding:12px;background:#f8fafc}
.nlp-feat .main{display:grid;gap:10px}
.nlp-feat .h{font-weight:700;margin:2px 0 6px}
.nlp-feat .group{border:1px dashed #cbd5e1;border-radius:10px;background:#fff;padding:10px;margin-bottom:10px}
.nlp-feat .group>.title{font-weight:700;margin-bottom:6px}
.nlp-feat .hint{color:#475569;font-size:12px}
.nlp-feat .mono{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.nlp-feat .pill, .nlp-feat .tabs-head button{
display:inline-block;padding:8px 12px;border-radius:999px;border:1px solid #cbd5e1;background:#fff;cursor:pointer;
position:relative;overflow:hidden;transition:transform .08s ease, box-shadow .2s ease, background .15s ease;
user-select:none
}
.nlp-feat .pill:active, .nlp-feat .tabs-head button:active{ transform:scale(0.98) }
.nlp-feat .btn-raised:hover{ box-shadow:0 2px 10px rgba(2,6,23,.08) }
/* Ripple */
.nlp-feat .ripple{
position:absolute;border-radius:999px;transform:scale(0);opacity:.35;pointer-events:none;background:#0ea5e9;
animation:ripple .5s ease-out forwards;
}
@keyframes ripple{ to{ transform:scale(8); opacity:0; } }
.nlp-feat textarea.ta{width:100%;min-height:100px;border:1px solid #cbd5e1;border-radius:8px;padding:8px;font:13px/1.4 ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.nlp-feat select.sel{width:100%;padding:8px 10px;border:1px solid #cbd5e1;border-radius:10px;background:#fff}
.nlp-feat .row{display:flex;gap:8px;flex-wrap:wrap;align-items:flex-end}
.nlp-feat .col{min-width:210px}
.nlp-feat .chips{display:flex;gap:6px;flex-wrap:wrap}
.nlp-feat .badge{display:inline-block;padding:2px 8px;border-radius:999px;background:#e2e8f0;font-size:12px}
.nlp-feat .tabs{border:1px solid #cbd5e1;border-radius:12px;background:#fff}
.nlp-feat .tabs-head{display:flex;gap:6px;padding:6px;border-bottom:1px solid #e2e8f0}
.nlp-feat .tabs-head button.active{color:#fff;border-color:transparent;background:#0369a1}
.nlp-feat .tabs-body{padding:10px}
.nlp-feat .tbl{border-collapse:collapse;width:100%}
.nlp-feat .tbl th,.nlp-feat .tbl td{border:1px solid #cbd5e1;padding:6px 8px;text-align:center;vertical-align:top}
.nlp-feat .tbl th{background:#f1f5f9}
.nlp-feat .kpi{display:grid;grid-template-columns:repeat(5,minmax(0,1fr));gap:6px}
.nlp-feat .k{border:1px solid #cbd5e1;border-radius:10px;background:#fff;padding:6px;min-height:40px}
.nlp-feat .k b{display:block;font-size:13px;line-height:1.1}
.nlp-feat .k div{font-size:11px;color:#475569}
.nlp-feat .tokens{line-height:1.9}
.nlp-feat .tok{display:inline-block;margin:2px 3px;padding:2px 6px;border-radius:8px;border:1px solid #e2e8f0}
.nlp-feat .call{border:1px solid #cbd5e1;border-left:4px solid #0369a1;background:#f8fafc;border-radius:10px;padding:10px}
.nlp-feat .heat{border:1px solid #cbd5e1;border-radius:10px;overflow:hidden}
.nlp-feat .legend{font-size:12px;color:#475569}
.nlp-feat .small{font-size:12px}
</style>
<div class="wrap">
<div class="side">
<div class="h">Feature Controls</div>
<div class="group">
<div class="title">1) Corpus</div>
<div id="ctrl-corpus"></div>
<div class="hint" style="margin-top:6px">เลือกชุดตัวอย่าง หรือแก้ไข/เพิ่มเอกสารเองได้</div>
</div>
<div class="group">
<div class="title">2) Preprocess</div>
<div id="ctrl-pre"></div>
</div>
<div class="group">
<div class="title">3) Normalization & N-grams</div>
<div id="ctrl-norm"></div>
<div id="ctrl-ng" style="margin-top:6px"></div>
</div>
<div class="group">
<div class="title">4) Feature Type</div>
<div id="ctrl-feat"></div>
</div>
<div class="group">
<div class="title">5) Preview</div>
<div id="ctrl-prev"></div>
</div>
</div>
<div class="main">
<div class="h">Interactive Feature Extraction Demo</div>
<div class="kpi" id="kpi"></div>
<div class="tabs">
<div class="tabs-head" id="tabs-h"></div>
<div class="tabs-body" id="tabs-b"></div>
</div>
<div class="call">
<b>Pipeline Summary</b>
<div id="pipe" class="mono small" style="margin-top:6px"></div>
</div>
</div>
</div>
</div>`;
// Ripple for ALL buttons
/**
 * Append a short-lived ".ripple" span to `btn`.
 *
 * @param {Element} btn - button that receives the ripple.
 * @param {MouseEvent} [ev] - click event; when omitted the ripple starts at
 *   the centre of the button.
 */
function makeRipple(btn, ev){
  const bounds = btn.getBoundingClientRect();
  // The ripple is a square as large as the button's longer side.
  const size = Math.max(bounds.width, bounds.height);
  const half = size / 2;
  // Origin relative to the button: pointer position, or the centre without an event.
  const originX = ev ? ev.clientX - bounds.left : bounds.width / 2;
  const originY = ev ? ev.clientY - bounds.top : bounds.height / 2;

  const ripple = html`<span class="ripple"></span>`;
  ripple.style.height = `${size}px`;
  ripple.style.width = `${size}px`;
  ripple.style.left = `${originX - half}px`;
  ripple.style.top = `${originY - half}px`;

  btn.appendChild(ripple);
  // The CSS animation lasts .5s; drop the node shortly afterwards.
  setTimeout(() => ripple.remove(), 600);
}
// One delegated listener on the root: a click on (or inside) any <button>
// within this cell gets a ripple.
box.addEventListener("click", (event) => {
  const button = event.target.closest("button");
  if (button && box.contains(button)) {
    makeRipple(button, event);
  }
});
// ---------- Mount points ----------
// Containers inside the layout, resolved in one pass (same order as the selectors).
const [ctrlCorpus, ctrlPre, ctrlNorm, ctrlNg, ctrlFeat, ctrlPrev, tabsH, tabsB, kpi, pipeEl] = [
  "#ctrl-corpus",
  "#ctrl-pre",
  "#ctrl-norm",
  "#ctrl-ng",
  "#ctrl-feat",
  "#ctrl-prev",
  "#tabs-h",
  "#tabs-b",
  "#kpi",
  "#pipe",
].map((selector) => box.querySelector(selector));
// ================== Examples (corpus) ==================
// Preset corpora: three named presets with five short English documents each.
// The keys are the options of the "Preset" picker; a preset is copied into the
// textarea one document per line.
const CORPUS = {
"Movie reviews (mini)":[
"I absolutely loved the movie — funny, charming, and surprisingly moving.",
"The plot was predictable, but the soundtrack was excellent.",
"Unfunny jokes and slow pacing made it boring.",
"A touching finale that left me smiling.",
"Average overall; strong visuals but uneven writing."
],
"Product reviews (mini)":[
"Battery lasts two days; camera is sharp but performance stutters.",
"Great build quality and the speakers are loud.",
"Disappointing — the screen cracked and support was unhelpful.",
"Good value for money and fast charging.",
"Works as advertised, nothing special."
],
"Support emails":[
"My account was charged twice this month; please help with a refund.",
"I can't reset my password after the update.",
"The app crashes whenever I upload photos.",
"How can I change the email associated with my account?",
"The invoice download button is missing."
]
};
// Controls — corpus
// Preset picker, a textarea holding one document per line, and three action
// buttons laid out in a row underneath.
const corpusSel = Inputs.select(Object.keys(CORPUS), {label:"Preset"});
const area = html`<textarea class="ta" placeholder="One document per line..."></textarea>`;
const applyPreset = html`<button class="pill btn-raised">Load preset</button>`;
const addDoc = html`<button class="pill btn-raised">Add empty doc</button>`;
const resetDocs = html`<button class="pill btn-raised">Clear</button>`;
ctrlCorpus.append(corpusSel, area, html`<div class="row" style="margin-top:6px"></div>`);
// The row <div> appended just above is ctrlCorpus.lastChild.
ctrlCorpus.lastChild.append(applyPreset, addDoc, resetDocs);
// Replace the textarea contents with the given documents, one per line.
function setDocs(lines){ area.value = lines.join("\n"); }
// Start with the currently selected preset loaded.
setDocs(CORPUS[corpusSel.value]);
applyPreset.onclick = ()=> setDocs(CORPUS[corpusSel.value]);
// Appends a trailing newline when the textarea has non-blank content; otherwise resets it to "".
addDoc.onclick = ()=> { area.value = (area.value.trim()? area.value+"\n": "") + ""; };
resetDocs.onclick = ()=> { area.value = ""; };
// Controls — preprocess
// Toggles read by tokenizeOne(): case folding, punctuation stripping, stopword removal.
const ckLower = Inputs.toggle({label:"Lowercase", value:true});
const ckPunct = Inputs.toggle({label:"Remove punctuation", value:true});
const ckStop = Inputs.toggle({label:"Remove English stopwords", value:false});
ckLower.classList.add("btn-raised"); ckPunct.classList.add("btn-raised"); ckStop.classList.add("btn-raised");
ctrlPre.append(ckLower, ckPunct, ckStop);
// Normalization + n-gram
// The option strings are compared verbatim in tokenizeOne(), renderKPI() and renderPipe().
const normSel = Inputs.radio(["None","Stemming","Lemmatization"], {label:"Normalization", value:"None"});
const ngSel = Inputs.select(["1 (unigram)","1–2 (uni+bi)"], {label:"N-grams", value:"1 (unigram)"});
ctrlNorm.append(normSel); ctrlNg.append(ngSel);
// Feature type & options
// featSel picks the cell weighting in vectorize(); l2Norm rescales each row to unit length.
const featSel = Inputs.radio(["BoW (Count)","TF","TF–IDF"], {label:"Feature type", value:"TF–IDF"});
const l2Norm = Inputs.toggle({label:"L2 normalize rows (for cosine)", value:true});
ctrlFeat.append(featSel, l2Norm);
// Preview options
// Limit how many columns / rows renderMatrix() shows.
const topKCols = Inputs.range([6, 30], {label:"Top feature columns to preview", value:12, step:2});
const maxRows = Inputs.range([3, 30], {label:"Rows to preview", value:10, step:1});
ctrlPrev.append(topKCols, maxRows);
// ================== NLP helpers ==================
// Small English stopword list (all lowercase). "t" is included, presumably to
// drop the fragment left behind when a contraction is split — TODO confirm.
const EN_STOP = new Set([
"a","an","and","are","as","at","be","but","by","for","if","in","into","is","it","its",
"of","on","or","such","t","that","the","their","then","there","these","they","this",
"to","was","will","with","you","your","i","me","my","we","our","he","she","them","his","her"
]);
/**
 * Light rule-based suffix stripper (a rough stemmer, not a full Porter stemmer).
 * Words of three characters or fewer are returned unchanged.
 * @param {string} w - a single token
 * @returns {string} the stemmed token
 */
function stem(w) {
  if (w.length <= 3) return w;
  if (w.endsWith("ies") && w.length > 4) return `${w.slice(0, -3)}y`;
  if (w.endsWith("sses")) return w.slice(0, -2);

  // Drop a plural "s" (never from "...ss") and keep applying rules to the rest.
  let word = w;
  if (word.endsWith("s") && !word.endsWith("ss")) word = word.slice(0, -1);

  if (word.endsWith("eed")) return word.slice(0, -1);

  const suffix = ["ed", "ing"].find((sfx) => word.endsWith(sfx));
  if (suffix !== undefined) {
    const base = word.slice(0, -suffix.length);
    // "rat(ed)" -> "rate", "enabl(ing)" -> "enable", "siz(ing)" -> "size"
    if (["at", "bl", "iz"].some((end) => base.endsWith(end))) return `${base}e`;
    // "runn(ing)" -> "run"
    const doubled = ["bb", "dd", "gg", "mm", "nn", "pp", "rr", "tt"];
    if (doubled.some((pair) => base.endsWith(pair))) return base.slice(0, -1);
    return base;
  }

  if (word.endsWith("ly") && word.length > 4) return word.slice(0, -2);
  return word;
}
// Irregular forms looked up by lemma() before any suffix rule is tried
// (surface form -> lemma).
const LEMMA_EX = new Map(Object.entries({
better:"good", best:"good", worse:"bad", worst:"bad",
did:"do", does:"do", done:"do", doing:"do",
has:"have", had:"have",
engaging:"engage", making:"make", running:"run", studied:"study",
children:"child", mice:"mouse", people:"person", batteries:"battery", stories:"story"
}));
/**
 * Collapse a trailing doubled consonant to a single letter ("stopp" -> "stop").
 * Strings that do not end in one of the listed pairs are returned unchanged.
 *
 * The replacement is computed from the matched pair itself; the previous
 * `"$1".slice(0,1)` evaluated to the literal string "$" before replace() ever
 * saw it, which turned "stopp" into "sto$".
 * @param {string} s
 * @returns {string}
 */
function undoubleLast(s){ return s.replace(/(bb|dd|ff|gg|ll|mm|nn|pp|rr|tt)$/, (pair) => pair[0]); }
/** Append a silent "e" to a stripped base, e.g. "lov" -> "love". */
function addSilentE(s) {
  return `${s}e`;
}
/**
 * Heuristic lemmatizer: irregular forms come from LEMMA_EX, everything else is
 * handled by ordered suffix rules (first match wins).
 *
 * For -ed / -ing forms the stripped base is repaired in one of two ways:
 *   - a doubled final consonant is collapsed ("stopp" -> "stop"), or
 *   - otherwise a silent "e" is restored when the base ends in a non-vowel
 *     ("lov" -> "love").
 * Previously the silent "e" was also appended after collapsing a doubled
 * consonant, producing forms such as "stope" / "plane" for "stopped" / "planned".
 *
 * @param {string} w - a single token (rules assume lowercase)
 * @returns {string} the lemma, or `w` unchanged when no rule applies
 */
function lemma(w){
  // Repair the base left after removing "-ed" / "-ing".
  const repairBase = (base) => {
    if (/(bb|dd|gg|mm|nn|pp|rr|tt)$/.test(base)) return undoubleLast(base);
    return /[aeiou]$/.test(base) ? base : addSilentE(base);
  };

  if (w.length <= 2) return w;
  if (LEMMA_EX.has(w)) return LEMMA_EX.get(w);
  if (/ies$/.test(w) && w.length > 4) return w.replace(/ies$/, "y");
  if (/sses$/.test(w)) return w.replace(/sses$/, "ss");
  if (/xes$|zes$|ches$|shes$/.test(w)) return w.replace(/es$/, "");
  // Plain plural "s"; words ending in "ss", "us" or "is" are left alone.
  if (/s$/.test(w) && !/ss$/.test(w) && !/(us|is)$/.test(w)) return w.replace(/s$/, "");
  if (/ied$/.test(w) && w.length > 4) return w.replace(/ied$/, "y");
  if (/ed$/.test(w) && w.length > 3) return repairBase(w.replace(/ed$/, ""));
  if (/ing$/.test(w) && w.length > 4) return repairBase(w.replace(/ing$/, ""));
  if (/ly$/.test(w) && w.length > 4) return w.replace(/ly$/, "");
  return w;
}
/**
 * Turn one document into a token list according to the current control values:
 * optional lowercasing, punctuation stripping, stopword removal, stemming or
 * lemmatization, and optional bigrams (joined with "_") appended after the unigrams.
 *
 * Stopwords are matched case-insensitively: EN_STOP holds lowercase words, so
 * with "Lowercase" switched off a plain lookup would let "The" / "Is" through.
 *
 * @param {string} txt - raw document text
 * @returns {string[]} tokens (unigrams, then bigrams when enabled)
 */
function tokenizeOne(txt){
  let t = ckLower.value ? txt.toLowerCase() : txt;
  if (ckPunct.value) {
    // En/em dashes become separators, then everything except ASCII letters,
    // digits, apostrophes and spaces is replaced by a space.
    t = t.replace(/[\u2013\u2014]/g, " ");
    t = t.replace(/[^a-zA-Z0-9' ]+/g, " ");
  }
  let toks = t.trim().split(/\s+/).filter(Boolean);
  if (ckStop.value) toks = toks.filter((w) => !EN_STOP.has(w.toLowerCase()));
  if (normSel.value === "Stemming") toks = toks.map(stem);
  if (normSel.value === "Lemmatization") toks = toks.map(lemma);
  if (ngSel.value === "1–2 (uni+bi)") {
    const bi = [];
    for (let i = 0; i + 1 < toks.length; i++) bi.push(toks[i] + "_" + toks[i + 1]);
    toks = toks.concat(bi);
  }
  return toks;
}
/**
 * Tokenize every document and build a sparse document–term matrix.
 *
 * Cell values depend on featSel: raw count (default), count / doc length ("TF"),
 * or TF times a smoothed idf, log((D+1)/(df+1)) + 1 ("TF–IDF"). When l2Norm is
 * on, each row is divided by its Euclidean norm.
 *
 * @param {string[]} docs
 * @returns {{X: Object[], V: number, vocab: Map<string, number>, idf: Float64Array, toksPerDoc: string[][], D: number}}
 *   X holds one plain object per document mapping column index -> value.
 */
function vectorize(docs) {
  const D = docs.length;
  const toksPerDoc = docs.map(tokenizeOne);

  // Vocabulary (token -> column, in first-seen order) and document frequencies.
  const vocab = new Map();
  const df = new Map();
  for (const toks of toksPerDoc) {
    for (const term of new Set(toks)) {
      if (!vocab.has(term)) vocab.set(term, vocab.size);
      df.set(term, (df.get(term) || 0) + 1);
    }
  }
  const V = vocab.size;

  const idf = new Float64Array(V);
  for (const [term, col] of vocab) {
    idf[col] = Math.log((D + 1) / ((df.get(term) || 0) + 1)) + 1;
  }

  const X = toksPerDoc.map((toks) => {
    const counts = {};
    for (const term of toks) {
      const col = vocab.get(term);
      counts[col] = (counts[col] || 0) + 1;
    }
    const n = toks.length || 1;

    const row = {};
    for (const key of Object.keys(counts)) {
      const col = Number(key);
      const count = counts[col];
      if (featSel.value === "TF–IDF") row[col] = (count / n) * idf[col];
      else if (featSel.value === "TF") row[col] = count / n;
      else row[col] = count;
    }

    if (l2Norm.value) {
      let sumSq = 0;
      for (const key of Object.keys(row)) sumSq += row[key] * row[key];
      const norm = Math.sqrt(sumSq) || 1;
      for (const key of Object.keys(row)) row[key] = row[key] / norm;
    }
    return row;
  });

  return { X, V, vocab, idf, toksPerDoc, D };
}
/**
 * Cosine similarity between two sparse rows (objects mapping column -> value).
 * When l2Norm is on the rows are treated as unit-length and the raw dot
 * product is returned; otherwise it is divided by both norms (a zero norm
 * counts as 1).
 * @param {Object} a
 * @param {Object} b
 * @returns {number}
 */
function cosine(a, b) {
  let dot = 0;
  for (const key of Object.keys(a)) {
    const other = b[key];
    if (other !== null && other !== undefined) dot += a[key] * other;
  }
  if (l2Norm.value) return dot;

  const norm = (row) =>
    Math.sqrt(Object.values(row).reduce((acc, v) => acc + v * v, 0)) || 1;
  return dot / (norm(a) * norm(b));
}
// ================== Tabs ==================
// [title, icon] for each tab; the position in this list is the tab index.
const TABS = [
  ["Vocabulary", "🔡"],
  ["Matrix (structural)", "🧱"],
  ["Doc vectors (non-zeros)", "📄"],
  ["Cosine similarity", "📈"]
];
// Index into TABS of the tab currently shown.
let active = 0;
/** Rebuild the tab header; clicking a tab selects it and re-renders everything. */
function renderTabs() {
  tabsH.innerHTML = "";
  for (const [index, [title, icon]] of TABS.entries()) {
    const button = html`<button class="${index === active ? "active" : ""}">${icon} ${title}</button>`;
    button.onclick = () => {
      active = index;
      renderAll();
      renderTabs();
    };
    tabsH.append(button);
  }
}
// ================== Render ==================
/**
 * Read the textarea as one document per line, trimming each line and
 * dropping blank ones.
 * @returns {string[]}
 */
function parseDocs() {
  return area.value
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== "");
}
/**
 * Recompute everything the views need from the current inputs.
 * @returns {{docs: string[], vec: Object, inv: string[]}} `inv` is the inverse
 *   vocabulary: inv[column] is the token for that column.
 */
function build() {
  const docs = parseDocs();
  const vec = vectorize(docs);
  const inv = [];
  for (const [token, col] of vec.vocab) {
    inv[col] = token;
  }
  return { docs, vec, inv };
}
/**
 * Redraw the KPI strip: one card per headline figure
 * (document count, vocabulary size, feature type, n-gram mode, L2 on/off).
 * @param {{docs: string[], vec: {V: number}}} state - result of build()
 */
function renderKPI(state) {
  const { docs, vec } = state;
  kpi.innerHTML = "";
  const cards = [
    ["Documents", docs.length],
    ["Vocabulary size", vec.V],
    ["Feature", featSel.value],
    ["N-grams", ngSel.value === "1 (unigram)" ? "unigram" : "uni+bi"],
    ["L2 norm", l2Norm.value ? "on" : "off"]
  ];
  for (const [label, value] of cards) {
    const card = html`<div class="k"></div>`;
    card.append(html`<b>${value}</b>`, html`<div>${label}</div>`);
    kpi.append(card);
  }
}
/**
 * Write a one-line summary of the active pipeline into pipeEl, e.g.
 * "lowercase > rm_punct > (keep stop) > no_norm > unigram > TF–IDF + L2".
 */
function renderPipe() {
  const caseStep = ckLower.value ? "lowercase" : "(keep case)";
  const punctStep = ckPunct.value ? "rm_punct" : "(keep punct)";
  const stopStep = ckStop.value ? "rm_stop" : "(keep stop)";
  const normStep = normSel.value === "None" ? "no_norm" : normSel.value.toLowerCase();
  const ngramStep = ngSel.value === "1 (unigram)" ? "unigram" : "unigram+bigram";
  const featureStep = featSel.value + (l2Norm.value ? " + L2" : "");
  pipeEl.textContent = [caseStep, punctStep, stopStep, normStep, ngramStep, featureStep].join(" > ");
}
/**
 * "Vocabulary" tab: list up to 200 tokens ordered by how many documents
 * contain them (ties broken by token text).
 * @param {{vec: Object, inv: string[]}} state - result of build()
 */
function renderVocab(state) {
  const { vec, inv } = state;

  // Per column: number of rows in which the column is present.
  const colFreq = new Float64Array(vec.V);
  for (const row of vec.X) {
    for (const key in row) colFreq[Number(key)] += 1;
  }

  const byFreqThenToken = (a, b) => colFreq[b] - colFreq[a] || (inv[a] > inv[b] ? 1 : -1);
  const order = [...Array(vec.V).keys()].sort(byFreqThenToken);
  const top = order.slice(0, 200);

  const table = html`<table class="tbl"></table>`;
  const head = html`<thead><tr><th style="width:52px">#</th><th>Token</th><th style="width:100px">Doc freq</th></tr></thead>`;
  const body = html`<tbody></tbody>`;
  let rank = 0;
  for (const col of top) {
    rank += 1;
    body.append(html`<tr><td>${rank}</td><td class="mono">${inv[col]}</td><td>${colFreq[col] | 0}</td></tr>`);
  }
  table.append(head, body);

  tabsB.innerHTML = "";
  tabsB.append(html`<div class="h">Vocabulary (top by document frequency)</div>`, table);
}
/**
 * "Matrix" tab: preview of the document–term matrix limited to the most
 * widespread columns (topKCols) and the first maxRows documents.
 * @param {{docs: string[], vec: Object, inv: string[]}} state - result of build()
 */
function renderMatrix(state) {
  const { docs, vec, inv } = state;

  // Rank columns by the number of rows they appear in, ties by token text.
  const colFreq = new Float64Array(vec.V);
  for (const row of vec.X) {
    for (const key in row) colFreq[Number(key)] += 1;
  }
  const byFreqThenToken = (a, b) => colFreq[b] - colFreq[a] || (inv[a] > inv[b] ? 1 : -1);
  const cols = [...Array(vec.V).keys()].sort(byFreqThenToken).slice(0, topKCols.value | 0);
  const shownDocs = docs.slice(0, maxRows.value | 0);

  const table = html`<table class="tbl"></table>`;
  const head = html`<thead><tr><th>#</th><th style="text-align:left">Doc</th>${cols.map((col) => html`<th class="mono">${inv[col]}</th>`)}</tr></thead>`;
  const body = html`<tbody></tbody>`;
  for (const [i, text] of shownDocs.entries()) {
    const line = html`<tr><td>${i + 1}</td><td style="text-align:left">${text.slice(0, 80)}</td></tr>`;
    const values = vec.X[i];
    for (const col of cols) {
      line.append(html`<td class="mono">${(values[col] || 0).toFixed(3)}</td>`);
    }
    body.append(line);
  }
  table.append(head, body);

  tabsB.innerHTML = "";
  tabsB.append(
    html`<div class="h">Document–Term Matrix (preview)</div>`,
    table,
    html`<div class="hint" style="margin-top:6px">ค่าขึ้นกับประเภทฟีเจอร์: BoW/TF/TF–IDF และ L2 normalization</div>`
  );
}
/**
 * "Doc vectors" tab: for each document, a table of its non-zero features
 * sorted by absolute value (at most 40 per document).
 * @param {{docs: string[], vec: Object, inv: string[]}} state - result of build()
 */
function renderVectors(state) {
  const { docs, vec, inv } = state;
  const container = html`<div></div>`;
  container.append(html`<div class="h">Non-zero features per document</div>`);

  for (const [i, text] of docs.entries()) {
    const features = Object.entries(vec.X[i])
      .map(([key, value]) => ({ tok: inv[Number(key)], val: value }))
      .sort((a, b) => Math.abs(b.val) - Math.abs(a.val))
      .slice(0, 40);

    const table = html`<table class="tbl" style="margin-bottom:8px"></table>`;
    table.append(html`<thead><tr><th colspan="3" style="text-align:left">Doc ${i + 1} — <span class="small">${text.slice(0, 90)}</span></th></tr><tr><th style="width:52px">#</th><th>Token</th><th style="width:120px">Value</th></tr></thead>`);
    const body = html`<tbody></tbody>`;
    for (const [rank, feature] of features.entries()) {
      body.append(html`<tr><td>${rank + 1}</td><td class="mono">${feature.tok}</td><td class="mono">${feature.val.toFixed(4)}</td></tr>`);
    }
    table.append(body);
    container.append(table);
  }

  tabsB.innerHTML = "";
  tabsB.append(container);
}
/**
 * "Cosine similarity" tab: full document × document similarity table.
 * @param {{docs: string[], vec: Object}} state - result of build()
 */
function renderCosine(state) {
  const { docs, vec } = state;
  // Every ordered pair is evaluated (the matrix is not mirrored).
  const M = docs.map((_, i) => docs.map((__, j) => cosine(vec.X[i], vec.X[j])));

  const table = html`<table class="tbl"></table>`;
  const head = html`<thead><tr><th></th>${docs.map((_, j) => html`<th>D${j + 1}</th>`)}</tr></thead>`;
  const body = html`<tbody></tbody>`;
  M.forEach((similarities, i) => {
    const line = html`<tr><th style="text-align:left">D${i + 1}</th></tr>`;
    for (const value of similarities) {
      line.append(html`<td>${value.toFixed(3)}</td>`);
    }
    body.append(line);
  });
  table.append(head, body);

  tabsB.innerHTML = "";
  tabsB.append(
    html`<div class="h">Cosine similarity (documents)</div>`,
    table,
    html`<div class="legend" style="margin-top:6px">ค่าสูง = เอกสารคล้ายกันมาก (ขึ้นกับฟีเจอร์ + normalization)</div>`
  );
}
/**
 * Draw the tab selected by `active` (0 vocabulary, 1 matrix, 2 doc vectors,
 * 3 cosine). Any other value draws nothing.
 * @param {Object} state - result of build()
 */
function renderActive(state) {
  switch (active) {
    case 0:
      return renderVocab(state);
    case 1:
      return renderMatrix(state);
    case 2:
      return renderVectors(state);
    case 3:
      return renderCosine(state);
    default:
      return undefined;
  }
}
/**
 * Full refresh: rebuild the state from the current inputs, then redraw the
 * KPI strip, the pipeline summary and the active tab, in that order.
 */
function renderAll() {
  const snapshot = build();
  renderKPI(snapshot);
  renderPipe();
  renderActive(snapshot);
}
// Tab header
renderTabs();
// Initial draw
renderAll();
// Wire inputs
// Form controls (the Inputs.* widgets and the textarea) emit "input" when the
// user changes them.
[
  corpusSel, area,
  ckLower, ckPunct, ckStop, normSel, ngSel,
  featSel, l2Norm, topKCols, maxRows
].forEach(el => el.addEventListener("input", renderAll));
// Plain <button>s never emit "input", and their onclick handlers change
// area.value programmatically (which fires no event), so the view would stay
// stale after "Load preset" / "Add empty doc" / "Clear". Re-render on "click"
// instead; the onclick handlers were assigned earlier, so they run before
// these listeners and renderAll sees the updated textarea.
[applyPreset, addDoc, resetDocs].forEach(btn => btn.addEventListener("click", renderAll));
return box;
})();(async () => {
// ===================== Layout & Styles =====================
// Root element of this widget: an inline stylesheet plus a two-column grid.
// Left column (.sa-side): one control group per process, each with a mount
// point (#ctrl-data, #ctrl-token, #ctrl-feature, #ctrl-model, #ctrl-eval).
// Right column (.sa-main): stepper (#steps), KPI strip (#kpi), active view (#view).
const box = html`<div style="max-width:1120px;font:14px system-ui, -apple-system, Segoe UI, Roboto, sans-serif; color:#0f172a;">
<style>
.sa-wrap{
display:grid;
grid-template-columns:360px 1fr;
gap:14px;
align-items:start;
}
.sa-side{
border:1px solid #cbd5e1;border-radius:12px;padding:12px;background:#f8fafc
}
/* pull up right column (global) */
.sa-main{
display:grid;
gap:4px;
margin-top:10px;
}
.sa-h{font-weight:700;margin:4px 0 6px}
.group{border:1px dashed #cbd5e1;border-radius:10px;padding:10px;background:#fff;margin-bottom:10px}
.group > .title{font-weight:700;margin-bottom:6px}
/* Stepper */
.sa-stepper{ display:flex; gap:4px; flex-wrap:wrap; margin-top:0; margin-bottom:2px; }
.sa-stepper button{
padding:2px 8px; border:1px solid #cbd5e1; border-radius:10px; background:#fff;
cursor:pointer; transition:all .15s; font-size:12px; line-height:1.1; height:22px; min-height:22px;
}
.sa-stepper button.active{ color:#fff; border-color:transparent; }
/* KPI: compact */
#kpi{
margin:0; padding:0;
display:grid; grid-template-columns:repeat(5,minmax(0,1fr));
gap:6px; transform:translateY(-4px);
}
.k{
border:1px solid #cbd5e1; border-radius:10px; background:#fff;
padding:3px 6px; min-height:26px;
display:flex; flex-direction:column; justify-content:center;
}
.k b{ display:block; font-size:13px; margin-bottom:1px; line-height:1.1; }
.k div{ font-size:11px; line-height:1.05; }
#view{ margin-top:0; }
.tbl{border-collapse:collapse;width:100%}
.tbl th,.tbl td{border:1px solid #cbd5e1;padding:6px 8px;text-align:center}
.badge{display:inline-block;padding:2px 8px;border-radius:999px;background:#e2e8f0;margin-left:6px;font-size:12px}
.pill{display:inline-block;padding:8px 12px;border-radius:999px;border:1px solid #cbd5e1;background:#fff; cursor:pointer; position:relative; overflow:hidden; transition:transform .08s ease, box-shadow .2s ease}
.pill:active{ transform: scale(0.98) }
.hint{color:#475569;font-size:12px}
.mono{font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, "Liberation Mono", monospace;}
.small{font-size:12px}
.tokens{line-height:1.9}
.tok{display:inline-block;margin:2px 3px;padding:2px 6px;border-radius:8px;border:1px solid #e2e8f0}
.tok.pos{background:#ecfdf5;border-color:#bbf7d0}
.tok.neg{background:#fef2f2;border-color:#fecaca}
.legend{display:flex;gap:8px;align-items:center}
.boxpos{width:14px;height:14px;border-radius:4px;background:#ecfdf5;border:1px solid #bbf7d0}
.boxneg{width:14px;height:14px;border-radius:4px;background:#fef2f2;border:1px solid #fecaca}
/* Predict button effects */
.btn-predict.loading{ pointer-events:none; box-shadow:0 0 0 0 rgba(34,197,94,.0); }
.btn-predict.loading::after{
content:""; position:absolute; inset:0; border-radius:999px;
border:2px solid rgba(34,197,94,.35); border-left-color:transparent;
animation:spin .7s linear infinite; margin:2px;
}
.btn-predict.done{ box-shadow:0 0 0 6px rgba(34,197,94,.15); }
@keyframes spin{ to{ transform:rotate(360deg);} }
.ripple{ position:absolute; border-radius:999px; transform:scale(0); opacity:.35; pointer-events:none; background:#22c55e; animation:ripple .5s ease-out forwards; }
@keyframes ripple{ to{ transform:scale(8); opacity:0; } }
.callout{border:1px solid #cbd5e1;border-left:4px solid #0f766e;background:#f8fafc;border-radius:10px;padding:10px}
/* -------- Tabs for Step 6 -------- */
.tabs{ border:1px solid #cbd5e1; border-radius:12px; background:#fff; }
.tabs-head{ display:flex; gap:6px; padding:6px; border-bottom:1px solid #e2e8f0; }
.tabs-head button{
padding:6px 10px; border:1px solid #cbd5e1; background:#fff; border-radius:10px; cursor:pointer; font-size:12px;
}
.tabs-head button.active{ color:#fff; border-color:transparent; background:#0f766e; }
.tabs-body{ padding:8px 10px; }
.tabs-body .row{ display:flex; gap:8px; align-items:flex-end; flex-wrap:wrap; }
.tabs-body .col{ min-width:220px; }
textarea.ta{ width:100%; min-height:80px; border:1px solid #cbd5e1; border-radius:8px; padding:8px; font:12px/1.35 ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; }
select.sel, input[type="range"].rg{ width:260px; }
</style>
<div class="sa-wrap">
<div class="sa-side">
<div class="sa-h">Controls (by Process)</div>
<div class="group" id="g1"><div class="title">Process 1 — Data</div><div id="ctrl-data"></div></div>
<div class="group" id="g2"><div class="title">Process 2 — Tokenization</div><div id="ctrl-token"></div></div>
<div class="group" id="g3"><div class="title">Process 3 — Features</div><div id="ctrl-feature"></div></div>
<div class="group" id="g4"><div class="title">Process 4 — Model</div><div id="ctrl-model"></div></div>
<div class="group" id="g5"><div class="title">Process 5 — Evaluate</div><div id="ctrl-eval"></div></div>
<!-- Process 6 controls moved into Step 6 tabset on the right -->
<div class="group" id="g6"><div class="title">Process 6 — Apply</div>
<div class="hint">All inputs for Apply are now in the Step 6 tabset on the right.</div>
</div>
</div>
<div class="sa-main">
<div class="sa-stepper" id="steps"></div>
<div id="kpi" class="kpi"></div>
<div id="view"></div>
</div>
</div>
</div>`;
// Mount points in the right column. `steps` is filled by renderStepper();
// `kpi` and `view` are presumably filled by render code further down — confirm.
const steps = box.querySelector("#steps");
const kpi = box.querySelector("#kpi");
const view = box.querySelector("#view");
// Step colors (declare ONCE)
// One accent color per step, indexed in the same order as the stepper buttons.
const stepColors = ["#1d4ed8","#059669","#d97706","#7c3aed","#dc2626","#0f766e"];
// ===================== Datasets =====================
// Each dataset is an array of [label, text] pairs.
// Movie-review sentiment: labels are "positive" / "negative" / "neutral".
const dsMovie = [
["positive","I loved this movie, it was fantastic and engaging."],
["positive","A brilliant story with great acting and direction."],
["positive","Heartwarming and fun—I would definitely watch again."],
["positive","Surprisingly good; the pacing and soundtrack were excellent."],
["positive","The cinematography was gorgeous and the ending satisfied me."],
["positive","A clever script that kept me smiling throughout."],
["negative","I hated this film; it was boring and predictable."],
["negative","Terrible acting and a weak script ruined it."],
["negative","Disappointing and way too long for such a simple plot."],
["negative","A mess from start to finish—do not recommend."],
["negative","Flat characters and clumsy dialogue dragged it down."],
["neutral","The movie was okay; some parts worked, others didn’t."],
["neutral","Average overall, neither exciting nor terrible."],
["positive","Clever dialogue and charming characters won me over."],
["negative","Unfunny jokes and annoying characters made it painful."],
["neutral","Mixed feelings: strong visuals but uneven pacing."]
];
// Product-review sentiment: [label, text] pairs labelled "positive" / "negative" / "neutral".
const dsProduct = [
["positive","This phone is fast and the camera is amazing."],
["positive","Excellent build quality and battery life lasts all day."],
["positive","Great value for money; highly recommend this product."],
["positive","The speakers are loud and clear for such a small device."],
["positive","Smooth performance even with multiple apps open."],
["negative","The screen cracked easily and support was unhelpful."],
["negative","Slow performance and the battery drains quickly."],
["negative","Very disappointing; not worth the price at all."],
["negative","Frequent freezes and buggy software updates."],
["neutral","It works as advertised but nothing special."],
["neutral","Setup was fine; overall an average experience."],
["negative","Arrived late and missing accessories—frustrating."],
["neutral","Decent display, but the build feels a bit cheap."]
];
// Spam detection: [label, text] pairs labelled "spam" / "ham".
const dsSpam = [
["spam","WIN a free trip!!! Click now: http://prize.example"],
["spam","Congratulations! You have been selected for a reward."],
["spam","Limited offer!!! Claim your voucher today."],
["spam","Earn $$$ working from home. Join today!"],
["spam","Your parcel is pending. Confirm payment to receive."],
["ham","Hi team, the meeting is at 3 pm in room 204."],
["ham","Please review the attached agenda for tomorrow."],
["ham","Thanks for your help with the report last night."],
["ham","Could you confirm the shipment arrived on time?"],
["ham","Reminder: standup at 9:30 in the main room."]
];
// Intent classification: [label, text] pairs labelled "sales" / "support".
const dsIntent = [
["sales","I'd like to upgrade to the premium plan."],
["sales","Do you offer discounts for students?"],
["sales","Can I get a demo of the enterprise tier?"],
["sales","What are the pricing options for annual billing?"],
["sales","Is there a free trial for teams?"],
["support","My account was charged twice last month."],
["support","I can't reset my password—please help."],
["support","The app crashes when I upload photos."],
["support","How do I change the email associated with my account?"],
["support","The invoice download button is missing on my page."]
];
// Topic classification: [label, text] pairs labelled "sports" / "politics" / "tech".
const dsTopic = [
["sports","The striker scored a late winner in extra time."],
["sports","The coach announced the final squad for the cup."],
["sports","The goalkeeper saved two penalties in the shootout."],
["politics","Parliament passed the new budget after a long debate."],
["politics","The minister discussed trade policy with reporters."],
["politics","A committee proposed amendments to the tax bill."],
["tech","Researchers unveiled a prototype quantum processor."],
["tech","The company released a major update to its OS."],
["tech","A startup raised funding to build AI safety tools."]
];
/**
 * Return a fresh shallow copy of the dataset for a demo task.
 * Any unrecognized task name falls back to the topic dataset.
 * @param {string} kind - task name as shown in the task selector
 * @returns {Array<[string, string]>} [label, text] pairs
 */
function getDataset(kind) {
  switch (kind) {
    case "Sentiment — Movie":
      return dsMovie.slice();
    case "Sentiment — Product":
      return dsProduct.slice();
    case "Spam Detection":
      return dsSpam.slice();
    case "Intent Classification":
      return dsIntent.slice();
    default:
      return dsTopic.slice();
  }
}
// default sample text
/**
 * Starter sentence for the given demo task; unknown task names get the
 * topic-classification sample.
 * @param {string} kind - task name as shown in the task selector
 * @returns {string}
 */
function defaultSampleForTask(kind) {
  switch (kind) {
    case "Sentiment — Movie":
      return "Surprisingly engaging story but the pacing felt slow.";
    case "Sentiment — Product":
      return "Great camera, but the battery drains too quickly.";
    case "Spam Detection":
      return "Claim your limited offer now and win a free voucher!";
    case "Intent Classification":
      return "My card was charged twice—can you help?";
    default:
      return "The parliament discussed a new tech policy today.";
  }
}
// Additional example sentences per demo task. The keys match the task names
// offered by the "Demo task" select. NOTE(review): presumably used to offer
// ready-made inputs in Step 6 — its consumer is not in this part of the file.
const SAMPLE_BANK = {
"Sentiment — Movie": [
"Brilliant cast but the plot twists were predictable.",
"A touching finale that left me smiling.",
"The humor didn’t land and the pacing dragged.",
"Visually stunning though emotionally distant.",
"Mediocre overall—not bad, not great."
],
"Sentiment — Product": [
"Battery lasts two days and charges quickly.",
"Screen is sharp, but performance stutters.",
"Solid value for the price tag.",
"Speaker quality is underwhelming for calls.",
"Love the design though the back scratches easily."
],
"Spam Detection": [
"You won a prize! Click to claim now!",
"Update: project brief attached for review.",
"Final notice: verify your payment information immediately.",
"Let’s reschedule our 1:1 to Friday.",
"Exclusive voucher awaits—limited time!"
],
"Intent Classification": [
"Can I switch to a monthly plan instead?",
"The app locked me out after the update.",
"Is there a discount for nonprofits?",
"I need help exporting my billing history.",
"Could we schedule a demo next week?"
],
"Topic Classification": [
"Developers shipped a patch to fix kernel bugs.",
"The striker missed a penalty but scored later.",
"Lawmakers debated reforms to the election rules.",
"The coach confirmed two injuries before the match.",
"A research lab announced advances in photonics."
]
};
// ===================== Controls (left) =====================
// Process 1 — Data: task picker, train/test split percentage, shuffle seed.
const taskSel = Inputs.select(["Sentiment — Movie","Sentiment — Product","Spam Detection","Intent Classification","Topic Classification"], {label:"Demo task"});
const splitSlider = Inputs.range([50, 90], {label:"Train split (%)", step:5, value:70});
const seedInput = Inputs.number({label:"Random seed", value:123, step:1, min:1, max:999999});
const resplitBtn = html`<button class="pill">Shuffle & Split</button>`;
box.querySelector("#ctrl-data").append(taskSel, splitSlider, seedInput, resplitBtn);
// Process 2 — Tokenization: values are collected by currentOptions() and consumed by tokenizeBase().
const lowerCk = Inputs.toggle({label:"Lowercase", value:true});
const punctCk = Inputs.toggle({label:"Remove punctuation", value:true});
const stopCk = Inputs.toggle({label:"Remove stopwords (EN)", value:false});
const ngramSel= Inputs.select(["1 (unigram)","1–2 (uni+bi)"], {label:"N-grams"});
const normSel = Inputs.radio(["None","Stemming","Lemmatization"], {label:"Normalization", value:"None"});
box.querySelector("#ctrl-token").append(lowerCk, punctCk, stopCk, ngramSel, normSel);
// Process 3 — Features: weighting scheme plus preview limits for the X matrix.
const featSel = Inputs.radio(["BoW (count)","TF","TF–IDF"], {value:"TF–IDF", label:"Feature type"});
const showStruct = Inputs.toggle({label:"Show structural features (X preview)", value:true});
const topKCols = Inputs.range([6, 20], {label:"Top feature columns to preview", step:2, value:12});
const rowsPreview = Inputs.range([4, 20], {label:"Rows to preview", step:2, value:10});
box.querySelector("#ctrl-feature").append(featSel, showStruct, topKCols, rowsPreview);
/**
 * Human-readable model label, e.g. "Naive Bayes (α=1.00)".
 *
 * Uses `opts.alphaEff` when it is set and falls back to `opts.alpha`
 * otherwise. The fallback is decided BEFORE converting to a number: the old
 * `+opts.alphaEff ?? +opts.alpha` turned a missing alphaEff into NaN, which is
 * not nullish, so the fallback never happened and the label read "α=NaN".
 *
 * @param {{alphaEff?: number, alpha?: number}} opts
 * @returns {string}
 */
function modelDisplayName(opts){
  const alpha = Number(opts.alphaEff ?? opts.alpha);
  return `Naive Bayes (α=${alpha.toFixed(2)})`;
}
// Process 4 — Model: a small "current model" read-out followed by the α controls.
const modelInfo = html`<div style="margin-bottom:8px">
<div class="hint">Current model</div>
<div id="model-name" style="font-weight:600">Naive Bayes (α=1.00)</div>
</div>`;
// α is either taken from the slider ("Manual α") or chosen from the candidate list ("Best α (grid)").
const alphaMode = Inputs.radio(["Manual α","Best α (grid)"], {label:"Alpha mode", value:"Manual α"});
const alphaRg = Inputs.range([0.1, 3], {label:"NB smoothing α", step:0.1, value:1.0});
// Comma-separated candidates; parsed and validated in currentOptions().
const alphaList = Inputs.text({label:"Candidate α (comma)", value:"0.1,0.3,0.5,1.0,1.5,2.0,3.0"});
const btnFindBest = html`<button class="pill">Find best α</button>`;
const bestInfo = html`<div class="hint" id="best-alpha-hint" style="margin-top:6px">Best α: —</div>`;
box.querySelector("#ctrl-model").append(modelInfo, alphaMode, alphaRg, alphaList, btnFindBest, bestInfo);
// Process 5 — Evaluate has no inputs of its own, only a pointer to the Step 5 tab.
box.querySelector("#ctrl-eval").append(html`<div class="hint">Open the <b>Step 5 — Evaluate</b> tab to see confusion matrix & metrics.</div>`);
// ===================== State for Step 6 inputs (moved to right) =====================
// Presumably the text to classify in Step 6; starts as the default sample for the selected task.
let applyTextVal = defaultSampleForTask(taskSel.value);
// NOTE(review): looks like the number of top tokens shown in Step 6 — confirm against the Step 6 renderer.
let topKTokVal = 12;
// Switching task resets the sample text and clears the `predicted` flag
// (`predicted` is declared further down in this scope).
taskSel.addEventListener("input", ()=>{
applyTextVal = defaultSampleForTask(taskSel.value);
predicted = false;
});
// ===================== Stepper =====================
// Button captions, in step order; step numbers are 1-based.
const stepNames = ["1) Data","2) Tokenization","3) Features","4) Train","5) Evaluate","6) Apply"];
// Currently selected step (1-based).
let step = 1;
/**
 * Rebuild the stepper buttons. The active step is filled with its accent
 * color; the others get a light tint on hover. Clicking a button selects that
 * step and calls refreshView().
 */
function renderStepper() {
  steps.innerHTML = "";
  for (const [index, name] of stepNames.entries()) {
    const button = html`<button>${name}</button>`;
    const color = stepColors[index];
    const stepNumber = index + 1;
    button.style.borderColor = color + "55";
    if (stepNumber === step) {
      button.classList.add("active");
      button.style.background = color;
    } else {
      button.onmouseenter = () => { button.style.background = color + "22"; };
      button.onmouseleave = () => { button.style.background = "#fff"; };
    }
    button.onclick = () => {
      step = stepNumber;
      refreshView();
    };
    steps.append(button);
  }
}
renderStepper();
// ===================== Tokenizers / Normalizers =====================
// Small English stopword list (all lowercase), consulted by tokenizeBase()
// when opts.stop is set.
const EN_STOP = new Set([
"a","an","and","are","as","at","be","but","by","for","if","in","into","is","it","its",
"of","on","or","such","t","that","the","their","then","there","these","they","this",
"to","was","will","with","you","your","i","me","my","we","our","he","she","them","his","her"
]);
/**
 * Light rule-based suffix stripper (a rough stemmer, not a full Porter stemmer).
 * Words of three characters or fewer are returned unchanged.
 * @param {string} w - a single token
 * @returns {string} the stemmed token
 */
function lightStemWord(w) {
  const endsWithAny = (text, endings) => endings.some((ending) => text.endsWith(ending));

  if (w.length <= 3) return w;
  if (w.endsWith("ies") && w.length > 4) return `${w.slice(0, -3)}y`;
  if (w.endsWith("sses")) return w.slice(0, -2);

  // A plural "s" (never the one in "...ss") is removed and the remaining
  // rules are applied to what is left.
  const word = w.endsWith("s") && !w.endsWith("ss") ? w.slice(0, -1) : w;

  if (word.endsWith("eed")) return word.slice(0, -1);

  if (endsWithAny(word, ["ed", "ing"])) {
    const base = word.endsWith("ed") ? word.slice(0, -2) : word.slice(0, -3);
    if (endsWithAny(base, ["at", "bl", "iz"])) return `${base}e`;
    if (endsWithAny(base, ["bb", "dd", "gg", "mm", "nn", "pp", "rr", "tt"])) return base.slice(0, -1);
    return base;
  }

  return word.endsWith("ly") && word.length > 4 ? word.slice(0, -2) : word;
}
// Irregular forms looked up by smartLemmaWord() before any suffix rule is
// tried (surface form -> lemma).
const LEMMA_EX = new Map(Object.entries({
"better":"good","best":"good","worse":"bad","worst":"bad",
"did":"do","does":"do","done":"do","doing":"do",
"has":"have","had":"have",
"engaging":"engage","making":"make","running":"run","studied":"study",
"children":"child","mice":"mouse","people":"person","batteries":"battery","stories":"story"
}));
/**
 * Collapse a trailing doubled consonant to a single letter ("plann" -> "plan").
 * Strings that do not end in one of the listed pairs are returned unchanged.
 *
 * The replacement is derived from the matched pair; the earlier
 * `"$1".slice(0,1)` was evaluated first and passed the literal "$" to
 * replace(), so "plann" became "pla$".
 * @param {string} cons
 * @returns {string}
 */
function undoubleLast(cons){ return cons.replace(/(bb|dd|ff|gg|ll|mm|nn|pp|rr|tt)$/, (pair) => pair.charAt(0)); }
/** Append a silent "e" to a stripped base, e.g. "charg" -> "charge". */
function addSilentE(base) {
  return `${base}e`;
}
/**
 * Heuristic lemmatizer: irregular forms come from LEMMA_EX, everything else is
 * handled by ordered suffix rules (first match wins).
 *
 * For -ed / -ing forms the stripped base is repaired in one of two ways:
 *   - a doubled final consonant is collapsed ("stopp" -> "stop"), or
 *   - otherwise a silent "e" is restored when the base ends in a non-vowel
 *     ("charg" -> "charge").
 * Previously the silent "e" was appended even after collapsing a doubled
 * consonant, yielding "stope" / "plane" for "stopped" / "planning".
 *
 * @param {string} w - a single token (rules assume lowercase)
 * @returns {string} the lemma, or `w` unchanged when no rule applies
 */
function smartLemmaWord(w){
  // Repair the base left after removing "-ed" / "-ing".
  const repairBase = (base) => {
    if (/(bb|dd|gg|mm|nn|pp|rr|tt)$/.test(base)) return undoubleLast(base);
    return /[aeiou]$/.test(base) ? base : addSilentE(base);
  };

  if (w.length <= 2) return w;
  if (LEMMA_EX.has(w)) return LEMMA_EX.get(w);
  if (/ies$/.test(w) && w.length > 4) return w.replace(/ies$/, "y");
  if (/sses$/.test(w)) return w.replace(/sses$/, "ss");
  if (/xes$|zes$|ches$|shes$/.test(w)) return w.replace(/es$/, "");
  // Plain plural "s"; words ending in "ss", "us" or "is" are left alone.
  if (/s$/.test(w) && !/ss$/.test(w) && !/(us|is)$/.test(w)) return w.replace(/s$/, "");
  if (/ied$/.test(w) && w.length > 4) return w.replace(/ied$/, "y");
  if (/ed$/.test(w) && w.length > 3) return repairBase(w.replace(/ed$/, ""));
  if (/ing$/.test(w) && w.length > 4) return repairBase(w.replace(/ing$/, ""));
  if (/ly$/.test(w) && w.length > 4) return w.replace(/ly$/, "");
  return w;
}
/**
 * Turn one text into a token list according to `opts`.
 *
 * Stopwords are matched case-insensitively: EN_STOP holds lowercase words, so
 * with `opts.lower` off a plain lookup would let "The" / "Is" through.
 *
 * @param {string} txt - raw text
 * @param {Object} opts
 * @param {boolean} opts.lower - lowercase the text first
 * @param {boolean} opts.punct - replace dashes and every character other than
 *   ASCII letters, digits, apostrophes and spaces with a space
 * @param {boolean} opts.stop - drop tokens found in EN_STOP
 * @param {string} opts.norm - "Stemming", "Lemmatization", or anything else for none
 * @param {string} opts.ngram - "1–2 (uni+bi)" appends "_"-joined bigrams after the unigrams
 * @returns {string[]}
 */
function tokenizeBase(txt, opts){
  let t = opts.lower ? txt.toLowerCase() : txt;
  if (opts.punct) t = t.replace(/[\u2013\u2014]/g, " ").replace(/[^a-zA-Z0-9' ]+/g, " ");
  let toks = t.trim().split(/\s+/).filter(Boolean);
  if (opts.stop) toks = toks.filter((w) => !EN_STOP.has(w.toLowerCase()));
  if (opts.norm === "Stemming") toks = toks.map(lightStemWord);
  if (opts.norm === "Lemmatization") toks = toks.map(smartLemmaWord);
  if (opts.ngram === "1–2 (uni+bi)") {
    const bi = [];
    for (let i = 0; i + 1 < toks.length; i++) bi.push(toks[i] + "_" + toks[i + 1]);
    toks = toks.concat(bi);
  }
  return toks;
}
// ===================== Vectorize / Model =====================
/**
 * Tokenize labelled documents and build a sparse document–term matrix.
 *
 * Cell values depend on opts.feat: raw count (default), count / doc length
 * ("TF"), or TF times a smoothed idf, log((D+1)/(df+1)) + 1 ("TF–IDF").
 * `idf` is only populated for "TF–IDF"; otherwise it is an empty object.
 *
 * @param {Array<[string, string]>} docs - [label, text] pairs (labels are ignored here)
 * @param {Object} opts - tokenizer options plus `feat`
 * @returns {{X: Object[], vocab: Map<string, number>, idf: Object, D: number, V: number, tokensPerDoc: string[][]}}
 */
function vectorize(docs, opts) {
  const D = docs.length;
  const tokensPerDoc = [];
  const vocab = new Map();
  const df = new Map();

  // Vocabulary in first-seen order, plus the number of documents containing each token.
  for (const [, text] of docs) {
    const toks = tokenizeBase(text, opts);
    tokensPerDoc.push(toks);
    for (const term of new Set(toks)) {
      if (!vocab.has(term)) vocab.set(term, vocab.size);
      df.set(term, (df.get(term) || 0) + 1);
    }
  }
  const V = vocab.size;

  const idf = {};
  if (opts.feat === "TF–IDF") {
    for (const [term, col] of vocab) {
      idf[col] = Math.log((D + 1) / ((df.get(term) || 0) + 1)) + 1;
    }
  }

  const X = tokensPerDoc.map((toks) => {
    const counts = {};
    for (const term of toks) {
      const col = vocab.get(term);
      counts[col] = (counts[col] || 0) + 1;
    }
    const n = toks.length || 1;
    const row = {};
    for (const key of Object.keys(counts)) {
      const col = Number(key);
      const count = counts[col];
      if (opts.feat === "TF–IDF") row[col] = (count / n) * idf[col];
      else if (opts.feat === "TF") row[col] = count / n;
      else row[col] = count;
    }
    return row;
  });

  return { X, vocab, idf, D, V, tokensPerDoc };
}
/**
 * Fit a multinomial Naive Bayes model on sparse rows.
 *
 * @param {Object[]} X - one object per document mapping column index -> weight
 * @param {Array} y - class label for each row of X
 * @param {number} V - number of feature columns
 * @param {number} alpha - additive smoothing constant
 * @returns {{kind: string, classes: Array, classToIdx: Map, logPrior: number[], logLik: Float64Array[]}}
 *   logLik[c][j] = log((weight of column j in class c + alpha) / (total class weight + alpha * V))
 */
function trainNB(X, y, V, alpha) {
  const classes = [...new Set(y)];
  const classToIdx = new Map();
  classes.forEach((label, k) => classToIdx.set(label, k));

  const K = classes.length;
  const N = y.length;
  const docsPerClass = new Array(K).fill(0);
  const sumW = classes.map(() => new Float64Array(V));
  const totW = new Float64Array(K);

  // Accumulate per-class document counts and feature mass.
  for (const [i, label] of y.entries()) {
    const c = classToIdx.get(label);
    docsPerClass[c] += 1;
    for (const [key, weight] of Object.entries(X[i])) {
      sumW[c][Number(key)] += weight;
      totW[c] += weight;
    }
  }

  const logPrior = docsPerClass.map((count) => Math.log(count / N));
  const logLik = sumW.map((weights, c) => {
    const denom = totW[c] + alpha * V;
    return Float64Array.from(weights, (w) => Math.log((w + alpha) / denom));
  });

  return { kind: "NB", classes, classToIdx, logPrior, logLik };
}
/**
 * Score one sparse row against a model produced by trainNB().
 *
 * @param {{classes: Array, logPrior: number[], logLik: ArrayLike<number>[]}} model
 * @param {Object} row - column index -> weight
 * @returns {{label: *, probs: number[], scores: number[]}} `scores` are the
 *   unnormalized log-posteriors, `probs` their softmax, `label` the class with
 *   the highest probability (the first one on ties).
 */
function predictNB(model, row) {
  const { classes, logPrior, logLik } = model;

  const scores = classes.map((_, c) => logPrior[c]);
  for (const [key, weight] of Object.entries(row)) {
    const j = Number(key);
    for (let c = 0; c < classes.length; c += 1) {
      scores[c] += weight * logLik[c][j];
    }
  }

  // Softmax, shifted by the maximum score for numerical stability.
  const peak = Math.max(...scores);
  const exps = scores.map((s) => Math.exp(s - peak));
  let Z = 0;
  for (const e of exps) Z += e;
  const probs = exps.map((e) => e / Z);

  let maxIdx = 0;
  probs.forEach((p, i) => {
    if (p > probs[maxIdx]) maxIdx = i;
  });

  return { label: classes[maxIdx], probs, scores };
}
// ===================== State & Compute =====================
// Seeded Lehmer generator (multiplier 48271, modulus 2^31 - 1).
// Returns a function producing deterministic floats strictly inside (0, 1).
//  - The product is formed in double precision (state < 2^31, multiplier < 2^16, so it
//    stays below 2^53 and is exact); a 32-bit multiply would wrap and yield negative draws.
//  - The seed is reduced into 1 .. 2^31 - 2, because a state of 0 would stay 0 forever.
const seedRand = (s) => {
  const M = 0x7fffffff;
  const n = Number.isFinite(+s) ? Math.trunc(+s) : 0;
  let state = ((n % M) + M) % M;
  if(state === 0) state = 1;
  return () => (state = (state * 48271) % M) / M;
};
// Snapshot of every control value the pipeline depends on.
// alphaCandidates is parsed from the comma-separated alphaList input; entries that are
// not finite positive numbers are dropped, and a default grid is used when none remain.
function currentOptions(){
  const DEFAULT_GRID = [0.1,0.3,0.5,1.0,1.5,2.0,3.0];
  const parsed = [];
  for(const piece of (alphaList.value || "").split(",")){
    const num = parseFloat(piece.trim());
    if(Number.isFinite(num) && num > 0) parsed.push(num);
  }
  const alphaCandidates = parsed.length ? parsed : DEFAULT_GRID;

  return {
    lower: lowerCk.value,
    punct: punctCk.value,
    stop: stopCk.value,
    ngram: ngramSel.value,
    norm: normSel.value,
    feat: featSel.value,
    showStruct: showStruct.value,
    // |0 coerces the control values to 32-bit integers
    topKCols: topKCols.value|0,
    rowsPreview: rowsPreview.value|0,
    model: "Naive Bayes",
    alphaMode: alphaMode.value,
    alpha: alphaRg.value,
    alphaCandidates
  };
}
// Result of the most recent computeAll() run (options, split, model, metrics, toRow); null until the first run.
let cache = null;
// Whether Step 6 should show a prediction: set by predictOnce(), cleared when inputs or controls change.
let predicted = false;
// Recompute the whole pipeline from the current control values and publish it in `cache`:
// seeded shuffle + train/test split, vectorization fitted on the training split,
// optional grid search over alpha, final Naive Bayes fit, and test-set evaluation
// (confusion matrix, per-class precision/recall/F1, macro F1).
// Also refreshes the "#best-alpha-hint" element when it is present in `box`.
function computeAll(){
  const dataAll = getDataset(taskSel.value);
  const opts = currentOptions();

  // shuffle + split
  // Fisher–Yates rather than sort() with a random comparator: an inconsistent comparator
  // gives an engine-dependent, biased order, while this yields one fixed permutation per seed.
  const rnd = seedRand(seedInput.value|0);
  const idx = dataAll.map((_,i)=>i);
  for(let i = idx.length - 1; i > 0; i--){
    // Fold the draw into [0,1) and clamp so the swap index always lies in 0..i.
    const j = Math.min(i, Math.floor(Math.abs(rnd()) * (i + 1)));
    [idx[i], idx[j]] = [idx[j], idx[i]];
  }
  const cut = Math.floor((splitSlider.value/100)*dataAll.length);
  const train = idx.slice(0, cut).map(i=>dataAll[i]);
  const test = idx.slice(cut).map(i=>dataAll[i]);

  // vectorize on train
  const vecTrain = vectorize(train, opts);
  const {V} = vecTrain;

  // Map raw text to a sparse row over the training vocabulary (unknown tokens are skipped).
  // The tokens are returned as well so callers can display them.
  function toRow(text){
    const toks = tokenizeBase(text, opts);
    const counts = {};
    for(const w of toks){
      if(!vecTrain.vocab.has(w)) continue;
      const j = vecTrain.vocab.get(w);
      counts[j] = (counts[j]||0)+1;
    }
    const n = toks.length || 1;
    const row = {};
    for(const jStr in counts){
      const j = +jStr;
      let val = counts[j];
      if(opts.feat==="TF") val = val / n;
      if(opts.feat==="TF–IDF") val = (counts[j]/n) * (vecTrain.idf[j] || 1);
      row[j] = val;
    }
    return {row, toks};
  }
  const toRowGeneric = (text) => toRow(text).row;

  // grid search alpha (80/20 inner split of the training data)
  let alphaEff = opts.alpha;
  let bestAlphaInfo = null;
  if(opts.alphaMode === "Best α (grid)"){
    const innerCut = Math.max(1, Math.floor(train.length * 0.8));
    const innerTrain = train.slice(0, innerCut);
    const innerVal = train.slice(innerCut);
    // Rows do not depend on alpha, so build them once for all candidates.
    const innerX = innerTrain.map(([, text]) => toRowGeneric(text));
    const innerY = innerTrain.map(([lab]) => lab);
    const valRows = innerVal.map(([lab, text]) => [lab, toRowGeneric(text)]);
    const evalAlpha = (alpha) => {
      const candidate = trainNB(innerX, innerY, V, alpha);
      let hits = 0;
      for(const [trueLab, row] of valRows){
        if(predictNB(candidate, row).label === trueLab) hits++;
      }
      // With no validation rows every alpha scores 1, so the smallest candidate wins the tie-break.
      const acc = valRows.length ? hits/valRows.length : 1;
      return {alpha, acc};
    };
    const results = opts.alphaCandidates.map(a=>evalAlpha(a));
    // Best accuracy first; ties broken by the smaller alpha.
    results.sort((a,b)=> b.acc - a.acc || a.alpha - b.alpha);
    const best = results[0] || {alpha: alphaEff, acc: 0};
    alphaEff = best.alpha;
    bestAlphaInfo = best;
  }

  // final train on the full training split
  const Xfull = train.map(([, text]) => toRowGeneric(text));
  const yfull = train.map(([lab]) => lab);
  const model = trainNB(Xfull, yfull, V, alphaEff);

  // evaluate
  const classes = Array.from(new Set([...train.map(d=>d[0]), ...test.map(d=>d[0])]));
  const Lidx = new Map(classes.map((c,i)=>[c,i]));
  // cm[trueClass][predictedClass]
  const cm = Array.from({length:classes.length}, ()=> new Array(classes.length).fill(0));
  let correct=0;
  for(const [trueLab, text] of test){
    const {row} = toRow(text);
    const pred = predictNB(model,row);
    if(pred.label===trueLab) correct++;
    cm[Lidx.get(trueLab)][Lidx.get(pred.label)]++;
  }
  const acc = test.length? (correct/test.length): 1;
  const metrics = classes.map((c, i)=>{
    const tp = cm[i][i];
    const fn = cm[i].reduce((a,b)=>a+b,0) - tp;
    let fp = 0; for(let r=0;r<classes.length;r++) fp += cm[r][i]; fp -= tp;
    // A class with no predictions (or no true samples) is scored 1 rather than 0/0.
    const prec = tp + fp === 0 ? 1 : tp/(tp+fp);
    const rec = tp + fn === 0 ? 1 : tp/(tp+fn);
    const f1 = (prec+rec===0) ? 0 : (2*prec*rec)/(prec+rec);
    return {label:c, prec, rec, f1};
  });
  const macroF1 = metrics.reduce((a,m)=>a+m.f1,0)/(metrics.length||1);

  cache = {
    opts: {...opts, alphaEff},
    train, test, vecTrain, model, classes, cm, acc, metrics, macroF1, toRow,
    bestAlphaInfo
  };

  const hint = box.querySelector("#best-alpha-hint");
  if(hint){
    if(opts.alphaMode === "Best α (grid)" && cache.bestAlphaInfo){
      hint.textContent = `Best α: ${cache.bestAlphaInfo.alpha} (val acc ${(cache.bestAlphaInfo.acc*100).toFixed(1)}%)`;
    } else {
      hint.textContent = "Best α: —";
    }
  }
}
// ===================== KPI / Views =====================
// Write the display name of the cached model into "#model-name", if both exist.
function refreshModelName(){
  if(cache){
    const label = box.querySelector("#model-name");
    if(label){
      label.textContent = modelDisplayName(cache.opts);
    }
  }
}
// Rebuild the KPI strip from the cached run: split sizes, vocabulary size, feature type, test accuracy.
function refreshKPI(){
  const {train, test, vecTrain, acc, opts} = cache;
  const makeK = (title, val, sub="") => html`<div class="k"><b>${val}</b><div>${title}</div>${sub?html`<div class="hint">${sub}</div>`:""}</div>`;
  const cards = [
    ["Train samples", train.length],
    ["Test samples", test.length],
    ["Vocab size", vecTrain.V],
    ["Feature", opts.feat],
    ["Accuracy (test)", (acc*100).toFixed(1)+"%"]
  ];
  kpi.innerHTML = "";
  kpi.append(...cards.map(([title, val]) => makeK(title, val)));
}
// Step 1 view: list every train and test sample (set, label, text) under a hint naming the current model.
function renderData(){
  const {train, test, opts} = cache;

  const trainRows = train.map(([lab, txt]) => html`<tr><td>train</td><td><span class="badge">${lab}</span></td><td>${txt}</td></tr>`);
  const testRows = test.map(([lab, txt]) => html`<tr><td>test</td><td><span class="badge">${lab}</span></td><td>${txt}</td></tr>`);

  const tb = html`<tbody></tbody>`;
  tb.append(...trainRows, ...testRows);

  const tbl = html`<table class="tbl small"></table>`;
  tbl.append(html`<thead><tr><th>Set</th><th>Label</th><th>Text</th></tr></thead>`, tb);

  const sec = html`<div></div>`;
  sec.append(
    html`<div class="sa-h">Step 1 — Data</div>`,
    html`<div class="hint">Current model: <b>${modelDisplayName(opts)}</b>. เลือก task / split / seed แล้วกด <b>Shuffle & Split</b>.</div>`,
    tbl
  );

  view.innerHTML = "";
  view.append(sec);
}
// Step 2 view: show up to six training samples next to their tokenization under the current options.
function renderToken(){
  const {opts, train} = cache;
  const examples = train.slice(0, Math.min(6, train.length));

  const tb = html`<tbody></tbody>`;
  examples.forEach(([lab, txt]) => {
    const toks = tokenizeBase(txt, opts);
    tb.append(html`<tr>
<td><span class="badge">${lab}</span></td>
<td>${txt}</td>
<td class="mono">${toks.join(" | ")}</td>
</tr>`);
  });

  const tbl = html`<table class="tbl small"></table>`;
  tbl.append(html`<thead><tr><th>Label</th><th>Original</th><th>Tokens</th></tr></thead>`, tb);

  const sec = html`<div></div>`;
  sec.append(
    html`<div class="sa-h">Step 2 — Tokenization</div>`,
    html`<div class="hint">Lowercase / punctuation / stopwords + <b>${opts.norm}</b>.</div>`,
    tbl
  );

  view.innerHTML = "";
  view.append(sec);
}
// Step 3 view: the 20 terms present in the most training documents, plus (optionally)
// a preview matrix of feature values — rows are samples, columns are the top terms.
function renderFeat(){
  // toRow is the same vectorizer the model was trained and evaluated with, so the preview
  // cannot drift from what the classifier actually sees.
  const {vecTrain, opts, train, test, toRow} = cache;
  const sec = html`<div></div>`;
  sec.append(html`<div class="sa-h">Step 3 — Features</div>`);
  sec.append(html`<div class="hint">Tokens → numeric features (${opts.feat}).</div>`);

  // freq[j] = number of training rows in which feature j is present
  const freq = new Float64Array(vecTrain.V).fill(0);
  for(const row of vecTrain.X){ for(const jStr in row){ const j=+jStr; freq[j]+=1; } }
  // inv: feature index -> token
  const inv = []; vecTrain.vocab.forEach((j,w)=> inv[j]=w);
  const order = Array.from({length:vecTrain.V},(_,j)=>j).sort((a,b)=>freq[b]-freq[a]).slice(0,Math.min(vecTrain.V, 20));

  const chips = html`<div style="display:flex;gap:6px;flex-wrap:wrap;margin:6px 0 10px;"></div>`;
  for(const j of order) chips.append(html`<span class="pill mono">${inv[j]}</span>`);
  sec.append(html`<div>Top terms:</div>`, chips);

  if(opts.showStruct){
    const colIdx = order.slice(0, opts.topKCols);
    const colNames = colIdx.map(j=>inv[j]);
    const rows = [...train, ...test].slice(0, opts.rowsPreview);
    const tbl = html`<table class="tbl small"></table>`;
    const thead = html`<thead><tr><th>#</th><th>Label</th>${colNames.map(n=>html`<th class="mono">${n}</th>`)}</tr></thead>`;
    const tb = html`<tbody></tbody>`;
    rows.forEach(([lab, text], i)=>{
      const rv = toRow(text).row;
      const tr = html`<tr><td>${i+1}</td><td><span class="badge">${lab}</span></td></tr>`;
      // Features absent from the sparse row are shown as 0.000.
      for(const j of colIdx){ tr.append(html`<td class="mono">${(rv[j]||0).toFixed(3)}</td>`); }
      tb.append(tr);
    });
    tbl.append(thead, tb);
    sec.append(html`<div style="margin-top:8px"><b>Structural features (preview)</b> — rows = samples, cols = tokens.</div>`, tbl);
  }

  view.innerHTML=""; view.append(sec);
}
// Step 4 view: one column per class listing the 12 tokens with the highest log-likelihood for that class.
function renderTrain(){
  const {model, vecTrain} = cache;
  const modelName = "Multinomial Naive Bayes";

  // inv: feature index -> token
  const inv = [];
  vecTrain.vocab.forEach((j,w)=> inv[j]=w);

  const sec = html`<div></div>`;
  sec.append(html`<div class="sa-h">Step 4 — Train (${modelName})</div>`);

  const list = html`<div style="display:grid;grid-template-columns:repeat(${model.classes.length},1fr);gap:10px"></div>`;
  model.classes.forEach((className, c) => {
    const ranked = Array.from({length:vecTrain.V}, (_, j) => [j, model.logLik[c][j]]);
    ranked.sort((a,b)=>b[1]-a[1]);
    const col = html`<div><div><b>${className}</b></div><div style="display:flex;flex-wrap:wrap;gap:6px;margin-top:6px"></div></div>`;
    // The second child is the flex container that receives the token pills.
    const chipBox = col.lastChild;
    for(const [j] of ranked.slice(0,12)){
      chipBox.append(html`<span class="pill mono">${inv[j]}</span>`);
    }
    list.append(col);
  });
  sec.append(list);

  view.innerHTML = "";
  view.append(sec);
}
// Step 5 view: confusion matrix (rows = true class, columns = predicted class),
// per-class precision / recall / F1, and overall test accuracy.
function renderEval(){
  const {classes, cm, acc, metrics} = cache;
  const sec = html`<div></div>`;
  sec.append(html`<div class="sa-h">Step 5 — Evaluate</div>`);

  // confusion matrix
  const tbl = html`<table class="tbl small"></table>`;
  const thead = html`<thead><tr><th></th>${classes.map(l=>html`<th>Pred: ${l}</th>`)}</tr></thead>`;
  const tb = html`<tbody></tbody>`;
  for(let i=0;i<classes.length;i++){
    const r = html`<tr><th>True: ${classes[i]}</th></tr>`;
    for(let j=0;j<classes.length;j++) r.append(html`<td>${cm[i][j]}</td>`);
    tb.append(r);
  }
  tbl.append(thead, tb);

  // per-class metrics
  const mt = html`<table class="tbl small" style="margin-top:8px"></table>`;
  mt.append(html`<thead><tr><th>Class</th><th>Precision</th><th>Recall</th><th>F1</th></tr></thead>`);
  const mb = html`<tbody></tbody>`;
  for(const m of metrics){ mb.append(html`<tr><td>${m.label}</td><td>${m.prec.toFixed(3)}</td><td>${m.rec.toFixed(3)}</td><td>${m.f1.toFixed(3)}</td></tr>`); }
  mt.append(mb);

  // Both tables must be attached to the section: confusion matrix first, metrics below it.
  sec.append(tbl, mt);
  sec.append(html`<div style="margin-top:8px" class="hint">Overall accuracy: <b>${(acc*100).toFixed(1)}%</b></div>`);
  view.innerHTML=""; view.append(sec);
}
// ---------- Helpers ----------
// Add a centred ".ripple" span to btn, sized to its longer side, and remove it after 500 ms.
function makeRipple(btn){
  const {width, height} = btn.getBoundingClientRect();
  const size = Math.max(width, height);
  const wave = html`<span class="ripple"></span>`;
  const px = (value) => value + "px";
  wave.style.width = wave.style.height = px(size);
  wave.style.left = px(width/2 - size/2);
  wave.style.top = px(height/2 - size/2);
  btn.appendChild(wave);
  setTimeout(() => wave.remove(), 500);
}
// Predict-button handler: play the button feedback (loading -> done -> cleared),
// then mark the input as predicted and re-render Step 6.
function predictOnce(btn){
  const marks = btn.classList;
  marks.add("loading");
  makeRipple(btn);

  const clearDone = () => marks.remove("done");
  const finishLoading = () => {
    marks.remove("loading");
    marks.add("done");
    setTimeout(clearDone, 400);
  };
  setTimeout(finishLoading, 350);

  predicted = true;
  renderApply();
}
// ---------- Step 6 — Apply (with tabset) ----------
function tokenContribNB(model, row){
const scores = model.logPrior.slice();
for(const jStr in row){ const j=+jStr, v=row[j]; for(let c=0;c<model.classes.length;c++) scores[c]+= v*model.logLik[c][j]; }
const order = scores.map((s,i)=>[s,i]).sort((a,b)=>b[0]-a[0]).map(d=>d[1]);
const win = order[0], alt = order[1] ?? order[0];
const contrib = {};
for(const jStr in row){
const j=+jStr, v=row[j];
contrib[j] = v*(model.logLik[win][j] - model.logLik[alt][j]);
}
return {win, alt, contrib};
}
// Step 6 view: a three-tab input area (free text / sample picker / explain options),
// a preview of the text to classify, and — once `predicted` is true — the prediction,
// per-token contribution chips and a table of the largest feature values.
// Reads and writes applyTextVal, topKTokVal and predicted, which live outside this function.
function renderApply(){
const {opts, vecTrain, model, toRow} = cache;
const sec = html`<div></div>`;
sec.append(html`<div class="sa-h">Step 6 — Apply (Predict on new text)</div>`);
// --- Tabset UI
const tabs = html`<div class="tabs"></div>`;
const head = html`<div class="tabs-head"></div>`;
const body = html`<div class="tabs-body"></div>`;
tabs.append(head, body);
const TAB_TEXT = 0, TAB_SAMPLE = 1, TAB_EXPL = 2;
// The active tab is local state, so every call to renderApply() starts on the Text tab.
let active = TAB_TEXT;
// Rebuild the tab buttons; clicking one switches the body and re-marks the active button.
function renderTabs(){
head.innerHTML="";
[["Text","✍️"],["Pick sample","📚"],["Explain options","🧩"]].forEach(([label,icon],i)=>{
const b = html`<button>${icon} ${label}</button>`;
if(i===active) b.classList.add("active");
b.onclick = ()=>{ active = i; renderBody(); renderTabs(); };
head.append(b);
});
}
// Rebuild the body for whichever tab is active.
function renderBody(){
body.innerHTML = "";
if(active===TAB_TEXT){
// TEXT
const ta = html`<textarea class="ta" placeholder="Type text here...">${applyTextVal}</textarea>`;
// Typing invalidates the previous prediction; the output below is not re-rendered until Predict is pressed.
ta.oninput = ()=>{ applyTextVal = ta.value; predicted = false; };
const row = html`<div class="row"></div>`;
const predictBtn = html`<button class="pill btn-predict">Predict</button>`;
predictBtn.onclick = ()=> predictOnce(predictBtn);
row.append(html`<div class="col" style="flex:1;min-width:320px">${ta}</div>`, predictBtn);
body.append(row, html`<div class="hint" style="margin-top:6px">Press <b>Predict</b> to run inference. Output appears below.</div>`);
}
if(active===TAB_SAMPLE){
// SAMPLES
const sel = html`<select class="sel"></select>`;
// Samples come from the bank for the selected task; an unknown task yields an empty list.
const bank = SAMPLE_BANK[taskSel.value] || [];
bank.forEach(s=> sel.append(html`<option value="${s}">${s}</option>`));
const useBtn = html`<button class="pill">Use sample</button>`;
// Copies the sample into the text state and re-renders, which returns to the Text tab.
useBtn.onclick = ()=>{ applyTextVal = sel.value; predicted = false; renderApply(); };
const row = html`<div class="row"></div>`;
row.append(sel, useBtn);
body.append(row, html`<div class="hint" style="margin-top:6px">เลือก sample แล้วกด <b>Use sample</b>. จากนั้นไปที่แท็บ <b>Text</b> เพื่อกด Predict.</div>`);
}
if(active===TAB_EXPL){
// EXPLAIN OPTIONS
const rg = html`<input class="rg" type="range" min="5" max="20" step="1" value="${topKTokVal}"/>`;
const lab = html`<div class="hint">Max tokens to display (contrib): <b>${topKTokVal}</b></div>`;
rg.oninput = () => {
topKTokVal = +rg.value;
// NOTE(review): this writes a second <div class="hint"> inside `lab`, which is already a div.hint — confirm the nesting is intended.
lab.innerHTML = `<div class="hint">Max tokens to display (contrib): <b>${topKTokVal}</b></div>`;
predicted = false;
};
body.append(html`<div class="row"><div class="col">${lab}${rg}</div></div>`);
}
}
renderTabs(); renderBody();
sec.append(tabs);
// Preview text + prediction/output
const textPreview = html`<div class="callout" style="margin-top:10px"><b>Text to classify</b><div class="mono" style="margin-top:6px;white-space:pre-wrap">${applyTextVal || "(empty)"}</div><div class="hint" style="margin-top:6px">${predicted?"":"Press <b>Predict</b> in the Text tab to run inference."}</div></div>`;
const out = html`<div style="margin-top:10px"></div>`;
sec.append(textPreview, out);
view.innerHTML=""; view.append(sec);
// Nothing below runs until the user has pressed Predict for the current input.
if(!predicted) return;
const text = applyTextVal || "";
const {row} = toRow(text);
// inv: feature index -> token
const inv = []; vecTrain.vocab.forEach((j,w)=> inv[j]=w);
const pred = predictNB(model,row);
const explain = tokenContribNB(model,row);
const probsLine = model.classes.map((c,i)=> `${c}: ${(pred.probs[i]*100).toFixed(1)}%`).join(" · ");
const headPred = html`<div style="margin:10px 0"><b>Prediction:</b> <span class="badge">${pred.label}</span> <span class="hint mono" style="margin-left:8px">${probsLine}</span></div>`;
const spans = html`<div class="tokens"></div>`;
// tokMap: token -> contribution towards the winning class (vs. the runner-up)
const tokMap = {};
for(const jStr in row){ const j=+jStr; tokMap[inv[j]] = explain.contrib[j]||0; }
const rendered = [];
// NOTE(review): tokens are re-tokenized with currentOptions() rather than the cached opts used by toRow — presumably identical because every control change recomputes the cache; confirm.
// Chips are emitted in text order and capped at topKTokVal; repeated tokens each get a chip; out-of-vocabulary tokens are skipped.
for(const t of tokenizeBase(text, currentOptions())){
if(rendered.length >= topKTokVal) break;
if(!(t in tokMap)) continue;
const d = tokMap[t]||0;
const span = html`<span class="tok mono" title="${(d>=0?'+':'')+d.toFixed(3)}">${t}</span>`;
if(d>=0) span.classList.add("pos"); else span.classList.add("neg");
spans.append(span);
rendered.push(t);
}
// Up to 15 non-zero features, largest absolute value first.
const nz = Object.entries(row).map(([jStr,v])=>({j:+jStr, v}));
nz.sort((a,b)=>Math.abs(b.v)-Math.abs(a.v));
const top = nz.slice(0, 15);
const tbl = html`<table class="tbl small" style="margin-top:8px"></table>`;
tbl.append(html`<thead><tr><th>#</th><th>Feature (token)</th><th>Value</th></tr></thead>`);
const tb = html`<tbody></tbody>`;
top.forEach((e,i)=> tb.append(html`<tr><td>${i+1}</td><td class="mono">${inv[e.j]}</td><td class="mono">${e.v.toFixed(3)}</td></tr>`));
tbl.append(tb);
out.innerHTML = "";
// NOTE(review): the heading says "top by contribution", but the chips above follow text order — confirm which is intended.
out.append(headPred, html`<div class="legend" style="margin:6px 0 10px"><div class="boxpos"></div><div class="small">token contributes to predicted class</div><div class="boxneg"></div><div class="small">token pushes to alternative</div></div>`, html`<div><b>Tokens (top by contribution)</b></div>`, spans, html`<div style="margin-top:8px"><b>Vector preview (non-zeros)</b></div>`, tbl);
}
// ===================== Per-step offsets (keep Step 6 aligned with Step 4) =====================
// Nudge the KPI strip and the view vertically; Step 6 uses a tighter layout than the other steps.
function setStepOffsets(){
  const onApplyStep = step === 6;
  kpi.style.transform = onApplyStep ? "translateY(-6px)" : "translateY(-4px)";
  view.style.marginTop = onApplyStep ? "-14px" : "0px";
}
// ===================== Wiring =====================
// Redraw everything that depends on the cached run and the selected step:
// model name, KPI strip (with the step's accent color), the step's view, the stepper and the offsets.
function refreshView(){
  refreshModelName();
  refreshKPI();
  kpi.style.borderTop = `4px solid ${stepColors[step-1]}`;

  switch(step){
    case 1: renderData(); break;
    case 2: renderToken(); break;
    case 3: renderFeat(); break;
    case 4: renderTrain(); break;
    case 5: renderEval(); break;
    case 6: renderApply(); break;
    default: break; // unknown step: leave the view untouched
  }

  renderStepper();
  setStepOffsets();
}
// Recompute the pipeline, drop any Step 6 prediction made with the previous model, and redraw.
function fullResetAndRefresh(){
// Recompute first so the views render against a fresh cache.
computeAll();
predicted = false;
refreshView();
}
// Initial compute and render.
computeAll();
refreshView();
// listeners
// Any "input" event on these controls recomputes the pipeline and redraws the current step.
[taskSel, splitSlider, seedInput,
lowerCk, punctCk, stopCk, ngramSel, normSel,
featSel, showStruct, topKCols, rowsPreview,
alphaRg, alphaMode, alphaList
].forEach(el=> el.addEventListener("input", ()=>{ fullResetAndRefresh(); }));
resplitBtn.onclick = fullResetAndRefresh;
// fullResetAndRefresh() already clears `predicted`; the explicit reset here is redundant but harmless.
btnFindBest.onclick = ()=>{ predicted=false; fullResetAndRefresh(); };
// The root element is the value this async IIFE resolves to.
return box;
})();(async () => {
// ===================== Layout & Styles =====================
// Root element of the clustering demo: scoped <style> rules plus a two-column grid —
// five control groups on the left; stepper, KPI strip and step view on the right.
const box = html`<div style="max-width:1120px;font:14px system-ui, -apple-system, Segoe UI, Roboto, sans-serif; color:#0f172a;">
<style>
.wrap{display:grid;grid-template-columns:360px 1fr;gap:14px}
.side{border:1px solid #cbd5e1;border-radius:12px;padding:12px;background:#f8fafc}
.main{display:grid;gap:12px}
.h{font-weight:700;margin:4px 0 6px}
.group{border:1px dashed #cbd5e1;border-radius:10px;padding:10px;background:#fff;margin-bottom:10px}
.group>.title{font-weight:700;margin-bottom:6px}
.stepper{display:flex;gap:4px;flex-wrap:wrap}
.stepper button{padding:6px 10px;border:1px solid #cbd5e1;border-radius:12px;background:#fff;cursor:pointer;transition:all .15s}
.stepper button.active{color:#fff;border-color:transparent}
.kpi{display:grid;grid-template-columns:repeat(5,minmax(0,1fr));gap:8px}
.k{border:1px solid #cbd5e1;border-radius:12px;background:#fff;padding:10px}
.k b{display:block;font-size:18px;margin-bottom:4px}
.tbl{border-collapse:collapse;width:100%}
.tbl th,.tbl td{border:1px solid #cbd5e1;padding:6px 8px;text-align:center;vertical-align:top}
.small{font-size:12px}
.mono{font-family:ui-monospace,SFMono-Regular,Menlo,Consolas,monospace}
.pill{display:inline-block;padding:2px 8px;border-radius:999px;border:1px solid #cbd5e1;background:#fff}
.legend{display:flex;flex-wrap:wrap;gap:8px;margin:6px 0}
.dot{display:inline-block;width:10px;height:10px;border-radius:50%;margin-right:6px;vertical-align:middle}
.plot{border:1px solid #cbd5e1;border-radius:12px;background:#fff;overflow:hidden}
.hint{color:#475569;font-size:12px}
</style>
<div class="wrap">
<div class="side">
<div class="h">Controls (Clustering)</div>
<div class="group">
<div class="title">1) Data</div>
<div id="ctrl-data"></div>
</div>
<div class="group">
<div class="title">2) Tokenization</div>
<div id="ctrl-token"></div>
</div>
<div class="group">
<div class="title">3) Features</div>
<div id="ctrl-feature"></div>
</div>
<div class="group">
<div class="title">4) Clustering</div>
<div id="ctrl-cluster"></div>
</div>
<div class="group">
<div class="title">5) Evaluate</div>
<div id="ctrl-eval"></div>
</div>
</div>
<div class="main">
<div class="stepper" id="steps"></div>
<div id="kpi" class="kpi"></div>
<div id="view"></div>
</div>
</div>
</div>`;
// Mount points declared in the template above.
const steps = box.querySelector("#steps");
const kpi = box.querySelector("#kpi");
const view = box.querySelector("#view");
// One accent color and one label per pipeline step (array index i corresponds to step i+1).
const stepColors = ["#2563eb","#059669","#d97706","#7c3aed","#dc2626"];
const stepNames = ["1) Data","2) Tokenization","3) Features","4) Cluster","5) Evaluate"];
// Currently selected step, 1-based.
let step = 1;
// Rebuild the step buttons. The current step is filled with its accent color; the others
// get a light hover tint. Clicking a button selects that step and calls refresh().
function renderStepper(){
  steps.innerHTML = "";
  for(const [i, name] of stepNames.entries()){
    const color = stepColors[i];
    const btn = html`<button>${name}</button>`;
    // "55" / "22" are hex alpha suffixes appended to the #rrggbb color
    btn.style.borderColor = color + "55";
    const isCurrent = (i + 1 === step);
    if(isCurrent){
      btn.classList.add("active");
      btn.style.background = color;
    } else {
      btn.onmouseenter = ()=> btn.style.background = color+"22";
      btn.onmouseleave = ()=> btn.style.background = "#fff";
    }
    btn.onclick = ()=>{ step = i + 1; refresh(); };
    steps.append(btn);
  }
}
// ===================== Datasets =====================
// Preset corpora keyed by display name; each value is a list of documents (one string per document).
// The keys populate the "Preset corpus" select.
const CORPUS = {
"Mini Reviews":[
"I absolutely loved the movie — funny, charming, and moving.",
"The plot was predictable, but the soundtrack was excellent.",
"Unfunny jokes and slow pacing made it boring.",
"A touching finale that left me smiling.",
"Average overall; strong visuals but uneven writing.",
"Battery lasts two days; camera is sharp but performance stutters.",
"Disappointing — the screen cracked and support was unhelpful.",
"Great build quality and the speakers are loud.",
"My account was charged twice this month; please refund.",
"The app crashes whenever I upload photos."
],
"Support Emails":[
"I can't reset my password after the update.",
"How can I change the email associated with my account?",
"The invoice download button is missing.",
"Please help—I'm charged twice for the same order.",
"Upload crashes on iOS 17 when selecting multiple photos.",
"Shipping address change request for order #4312.",
"Refund was approved but I didn't receive the money.",
"Two-factor code never arrives to my phone.",
"Attachment upload stalls at 95 percent.",
"Pricing page shows old plan tiers."
],
"Mixed Topics":[
"The striker scored a late winner in extra time.",
"Parliament passed the new budget after a long debate.",
"Researchers unveiled a prototype quantum processor.",
"The coach announced the final squad for the cup.",
"The minister discussed trade policy with reporters.",
"The company released a major update to its OS.",
"Team signed a veteran defender on a two-year deal.",
"Senate will vote on the climate package next week.",
"Startups race to build faster AI accelerators.",
"Forward missed training due to a minor injury."
]
};
// ===================== Controls =====================
// Containers for the five control groups declared in the layout template.
const ctrlData = box.querySelector("#ctrl-data");
const ctrlToken = box.querySelector("#ctrl-token");
const ctrlFeature = box.querySelector("#ctrl-feature");
const ctrlCluster = box.querySelector("#ctrl-cluster");
const ctrlEval = box.querySelector("#ctrl-eval");
// Data
const corpSel = Inputs.select(Object.keys(CORPUS), {label:"Preset corpus"});
// Editable document list: buildState() reads this textarea, one document per line.
const areaDocs = html`<textarea style="width:100%;min-height:110px;border:1px solid #cbd5e1;border-radius:8px;padding:8px;font:13px/1.4 ui-monospace,SFMono-Regular,Menlo,Consolas,monospace" placeholder="One document per line..."></textarea>`;
const loadBtn = html`<button class="pill">Load preset</button>`;
ctrlData.append(corpSel, areaDocs, html`<div style="margin-top:6px"></div>`);
// The spacer div appended last also hosts the load button.
ctrlData.lastChild.append(loadBtn);
function setDocs(lines){ areaDocs.value = lines.join("\n"); }
// Start with the initially selected preset; later presets are copied in only when "Load preset" is clicked.
setDocs(CORPUS[corpSel.value]);
loadBtn.onclick = ()=> setDocs(CORPUS[corpSel.value]);
// Tokenization
const lowerCk = Inputs.toggle({label:"Lowercase", value:true});
const punctCk = Inputs.toggle({label:"Remove punctuation", value:true});
const stopCk = Inputs.toggle({label:"Remove stopwords (EN)", value:false});
const normSel = Inputs.radio(["None","Stemming","Lemmatization"], {label:"Normalization", value:"None"});
const ngramSel= Inputs.select(["1 (unigram)","1–2 (uni+bi)"], {label:"N-grams", value:"1 (unigram)"});
ctrlToken.append(lowerCk, punctCk, stopCk, normSel, ngramSel);
// Features
const featSel = Inputs.radio(["TF","TF–IDF"], {label:"Feature type", value:"TF–IDF"});
const l2Ck = Inputs.toggle({label:"L2 normalize rows (recommended)", value:true});
ctrlFeature.append(featSel, l2Ck);
// Clustering
const algoSel = Inputs.radio(["K-Means","Agglomerative (single)"], {label:"Algorithm", value:"K-Means"});
const kRange = Inputs.range([2, 10], {label:"k (clusters)", value:3, step:1});
// The seed is passed to kmeans() only; agglomerativeSingle() takes no seed.
const seedNum = Inputs.number({label:"Random seed", value:42, step:1, min:1, max:999999});
ctrlCluster.append(algoSel, kRange, seedNum);
// Evaluate
const showTop = Inputs.range([3, 15], {label:"Top terms per cluster", value:8, step:1});
ctrlEval.append(showTop);
// ===================== NLP helpers =====================
// English stopwords removed by tokenize() when the stopword toggle is on.
// Entries are lowercase, and tokens are compared as-is, so matching depends on the Lowercase toggle.
const EN_STOP = new Set([
"a","an","and","are","as","at","be","but","by","for","if","in","into","is","it","its",
"of","on","or","such","t","that","the","their","then","there","these","they","this",
"to","was","will","with","you","your","i","me","my","we","our","he","she","them","his","her"
]);
// Small rule-based suffix stripper (plural, -eed, -ed/-ing, -ly). Words of three letters or fewer
// are returned unchanged. The result need not be a dictionary word ("loved" -> "lov").
function stem(w){
  if(w.length <= 3) return w;
  if(w.endsWith("ies") && w.length > 4) return w.slice(0, -3) + "y";
  if(w.endsWith("sses")) return w.slice(0, -2);

  // Plural "s" is stripped first; the remaining rules then see the singular form.
  let word = w;
  if(word.endsWith("s") && !word.endsWith("ss")) word = word.slice(0, -1);

  if(word.endsWith("eed")) return word.slice(0, -1);

  let suffix = "";
  if(word.endsWith("ed")) suffix = "ed";
  else if(word.endsWith("ing")) suffix = "ing";
  if(suffix !== ""){
    const base = word.slice(0, word.length - suffix.length);
    // at/bl/iz bases regain a final "e"; a doubled final consonant is undoubled
    if(/(at|bl|iz)$/.test(base)) return base + "e";
    if(/(bb|dd|gg|mm|nn|pp|rr|tt)$/.test(base)) return base.slice(0, -1);
    return base;
  }

  if(word.endsWith("ly") && word.length > 4) return word.slice(0, -2);
  return word;
}
// Irregular forms, looked up before any suffix rule is tried.
const LEMMA_EX = new Map(Object.entries({
  better:"good", best:"good", worse:"bad", worst:"bad",
  did:"do", does:"do", done:"do", doing:"do",
  has:"have", had:"have",
  engaging:"engage", making:"make", running:"run", studied:"study",
  children:"child", mice:"mouse", people:"person", batteries:"battery", stories:"story"
}));

// Drop one letter of a doubled final consonant ("stopp" -> "stop"); other strings pass through.
// Implemented with slice: in a replacement string, "$" is only special when followed by a
// pattern character, so a computed replacement such as "$1".slice(0,1) would insert a literal "$".
function undoubleLast(s){
  return /(bb|dd|ff|gg|ll|mm|nn|pp|rr|tt)$/.test(s) ? s.slice(0, -1) : s;
}

function addSilentE(s){ return s + "e"; }

// Repair the stem left after removing -ed / -ing.
//  - doubled final consonant: undouble it and stop ("stopp" -> "stop", never "stope")
//  - otherwise a consonant-final stem gets a silent "e" back ("lov" -> "love")
// The second rule is a heuristic and over-applies to stems such as "walk".
function lemmaBase(base){
  if(/(bb|dd|gg|mm|nn|pp|rr|tt)$/.test(base)) return undoubleLast(base);
  if(!/[aeiou]$/.test(base)) return addSilentE(base);
  return base;
}

// Rule-based lemmatizer: exception table first, then plural, -ied, -ed, -ing and -ly rules.
// Words of two letters or fewer are returned unchanged.
function lemma(w){
  if(w.length<=2) return w;
  if(LEMMA_EX.has(w)) return LEMMA_EX.get(w);
  if(/ies$/.test(w) && w.length>4) return w.replace(/ies$/,"y");
  if(/sses$/.test(w)) return w.replace(/sses$/,"ss");
  if(/xes$|zes$|ches$|shes$/.test(w)) return w.replace(/es$/,"");
  // plain plural "s", but not -ss / -us / -is words ("glass", "bus", "this")
  if(/s$/.test(w) && !/ss$/.test(w) && !/(us|is)$/.test(w)) return w.replace(/s$/,"");
  if(/ied$/.test(w) && w.length>4) return w.replace(/ied$/,"y");
  if(/ed$/.test(w) && w.length>3) return lemmaBase(w.replace(/ed$/,""));
  if(/ing$/.test(w) && w.length>4) return lemmaBase(w.replace(/ing$/,""));
  if(/ly$/.test(w) && w.length>4) return w.replace(/ly$/,"");
  return w;
}
// Turn one document into tokens according to the tokenization controls:
// optional lowercasing, punctuation removal (en/em dashes become spaces; only ASCII letters,
// digits, apostrophes and spaces are kept), whitespace split, optional stopword removal,
// optional stemming or lemmatization, and optional "a_b" bigrams appended after the unigrams.
function tokenize(text){
  const cased = lowerCk.value ? text.toLowerCase() : text;
  const cleaned = punctCk.value
    ? cased.replace(/[\u2013\u2014]/g," ").replace(/[^a-zA-Z0-9' ]+/g," ")
    : cased;

  let toks = cleaned.trim().split(/\s+/).filter(Boolean);
  if(stopCk.value){
    toks = toks.filter((w) => !EN_STOP.has(w));
  }
  if(normSel.value==="Stemming") toks = toks.map(stem);
  if(normSel.value==="Lemmatization") toks = toks.map(lemma);

  if(ngramSel.value==="1–2 (uni+bi)"){
    // pair each token with its successor
    const bigrams = toks.slice(0, -1).map((w, i) => w+"_"+toks[i+1]);
    toks = toks.concat(bigrams);
  }
  return toks;
}
// Vectorizer (TF / TF-IDF) -> dense matrix (L2 optional)
// Tokenize the documents and build a dense N x V matrix (one Float64Array per document).
// Each entry is (count / tokensInDoc) * idf, where idf is 1 for "TF" and
// log((N+1)/(df+1)) + 1 for "TF–IDF". Rows are scaled to unit L2 length when the
// L2 toggle is on. Returns {X, vocab, tokDocs}; vocab maps token -> column index.
function vectorize(docs){
  const tokDocs = docs.map((doc) => tokenize(doc));

  // Vocabulary in first-seen order, plus document frequency per token.
  const vocab = new Map();
  const df = new Map();
  for(const toks of tokDocs){
    for(const w of toks){
      if(!vocab.has(w)) vocab.set(w, vocab.size);
    }
    for(const w of new Set(toks)){
      df.set(w, (df.get(w)||0)+1);
    }
  }

  const V = vocab.size;
  const N = tokDocs.length;
  const useIdf = featSel.value==="TF–IDF";
  const idf = new Float64Array(V);
  for(const [w, j] of vocab){
    idf[j] = useIdf ? Math.log((N+1)/((df.get(w)||0)+1)) + 1 : 1.0;
  }

  const X = tokDocs.map((toks) => {
    const rowVec = new Float64Array(V);
    const counts = new Map();
    for(const w of toks){
      const j = vocab.get(w);
      if(j!=null) counts.set(j, (counts.get(j)||0)+1);
    }
    const n = toks.length || 1;
    for(const [j, c] of counts){
      rowVec[j] = (c/n) * idf[j];
    }
    return rowVec;
  });

  if(l2Ck.value){
    for(const rowVec of X){
      let sq = 0;
      for(let j=0;j<V;j++) sq += rowVec[j]*rowVec[j];
      // all-zero rows are left as they are
      const norm = Math.sqrt(sq)||1;
      for(let j=0;j<V;j++) rowVec[j] /= norm;
    }
  }
  return {X, vocab, tokDocs};
}
// ===================== Clustering algos =====================
// Seeded 32-bit linear congruential generator (a = 1664525, c = 1013904223, modulus 2^32).
// Returns a function producing deterministic floats in [0, 1).
// The state is divided by 2^32 rather than 2^32 - 1, so the result can never be exactly 1:
// callers compute Math.floor(rnd() * n) and need an index strictly below n.
function rand(seed){
  let s = seed>>>0;
  return ()=> (s = (s*1664525 + 1013904223)>>>0) / 0x100000000;
}
// Lloyd's k-means with squared Euclidean distance.
//   X     - array of equal-length numeric rows (plain arrays or typed arrays)
//   k     - number of clusters
//   seed  - seed for the initial-center draw (same seed, same result)
//   iters - maximum number of assign/update rounds
// Returns one 0-based cluster label per row; an empty matrix yields [].
function kmeans(X, k, seed=42, iters=50){
  const n = X.length;
  if(n === 0) return [];
  const rnd = rand(seed);
  const d = X[0].length;

  // Initial centers are distinct rows (partial Fisher–Yates over the row indices): drawing the
  // same row twice would create two identical centers, one of which can never win a point.
  // Only when k exceeds the number of rows are the extra centers drawn with repetition.
  const draw = (limit) => Math.min(limit - 1, Math.floor(rnd() * limit)); // clamp guards a draw of exactly 1
  const pool = Array.from({length:n}, (_, i) => i);
  const centers = Array.from({length:k}, (_, c) => {
    if(c >= n) return X[draw(n)].slice();
    const pick = c + draw(n - c);
    [pool[c], pool[pick]] = [pool[pick], pool[c]];
    return X[pool[c]].slice();
  });

  const labels = new Array(n).fill(0);
  for(let it=0; it<iters; it++){
    let moved=false;
    // assign each row to its nearest center (ties keep the lower cluster index)
    for(let i=0;i<n;i++){
      let best=0, bestDist=Infinity;
      for(let c=0;c<k;c++){
        let dist=0; const mu=centers[c], xi=X[i];
        for(let j=0;j<d;j++){ const dv=xi[j]-mu[j]; dist+=dv*dv; }
        if(dist<bestDist){ bestDist=dist; best=c; }
      }
      if(labels[i]!==best){ labels[i]=best; moved=true; }
    }
    // Converged once a full pass changes nothing. Round 0 is exempt because labels start at 0.
    if(!moved && it>0) break;
    // update: each center becomes the mean of its rows; empty clusters keep their center
    const sum = Array.from({length:k}, ()=> new Float64Array(d).fill(0));
    const cnt = new Array(k).fill(0);
    for(let i=0;i<n;i++){ const c=labels[i]; cnt[c]++; for(let j=0;j<d;j++) sum[c][j]+=X[i][j]; }
    for(let c=0;c<k;c++){ if(cnt[c]===0) continue; for(let j=0;j<d;j++) centers[c][j]=sum[c][j]/cnt[c]; }
  }
  return labels;
}
// Agglomerative (single-linkage) — simple/naive for small demo
// Naive single-linkage agglomerative clustering: start with one cluster per row and repeatedly
// merge the two clusters whose closest members are nearest (Euclidean), until k clusters remain.
// Returns one 0-based label per row. Intended for small inputs only.
function agglomerativeSingle(X, k){
  const n = X.length;

  const euclid = (p, q) => {
    let sq = 0;
    for(let a = 0; a < X[p].length; a++){
      const diff = X[p][a] - X[q][a];
      sq += diff*diff;
    }
    return Math.sqrt(sq);
  };

  const groups = [];
  for(let i = 0; i < n; i++) groups.push([i]);

  while(groups.length > k){
    // closest pair of groups; the first pair found wins ties
    let mergeA = 0, mergeB = 1, closest = Infinity;
    for(let a = 0; a < groups.length; a++){
      for(let b = a + 1; b < groups.length; b++){
        // single linkage: distance between the two nearest members
        let link = Infinity;
        for(const p of groups[a]){
          for(const q of groups[b]) link = Math.min(link, euclid(p, q));
        }
        if(link < closest){
          mergeA = a; mergeB = b; closest = link;
        }
      }
    }
    groups[mergeA] = groups[mergeA].concat(groups[mergeB]);
    groups.splice(mergeB, 1);
  }

  const labels = new Array(n).fill(0);
  groups.forEach((members, g) => {
    for(const i of members) labels[i] = g;
  });
  return labels;
}
// Silhouette (cosine if L2, else Euclidean)
// Mean silhouette coefficient of a clustering.
//   X      - array of numeric rows
//   labels - 0-based cluster label per row
// Distance is 1 - dot product when the L2 toggle is on (rows are unit length, so this is
// cosine distance) and Euclidean otherwise. Returns 0 for fewer than 3 rows or fewer than
// 2 clusters. Rows in a single-member cluster score 0, the usual silhouette convention.
function silhouette(X, labels){
  const n = X.length; if(n<3) return 0;
  const k = Math.max(...labels)+1; if(k<2) return 0;
  const useCosine = l2Ck.value;

  function dist(i,j){
    const p = X[i], q = X[j];
    let acc = 0;
    if(useCosine){
      // rows L2-normalized → cosine similarity is the dot product
      for(let t=0;t<p.length;t++) acc += p[t]*q[t];
      return 1 - acc;
    }
    for(let t=0;t<p.length;t++){ const dv=p[t]-q[t]; acc+=dv*dv; }
    return Math.sqrt(acc);
  }

  const size = new Array(k).fill(0);
  for(const c of labels) size[c]++;

  let sum=0;
  for(let i=0;i<n;i++){
    const ci = labels[i];
    // a(i) is undefined for a cluster of one; by convention s(i) = 0
    if(size[ci] <= 1) continue;

    // total distance from row i to the members of every cluster (excluding itself)
    const tot = new Float64Array(k);
    for(let j=0;j<n;j++){ if(j!==i) tot[labels[j]] += dist(i,j); }

    // a(i): mean distance to the other members of its own cluster
    const a = tot[ci] / (size[ci]-1);
    // b(i): smallest mean distance to any other non-empty cluster
    let b = Infinity;
    for(let c=0;c<k;c++){
      if(c!==ci && size[c]>0) b = Math.min(b, tot[c]/size[c]);
    }
    if(!Number.isFinite(b)) continue; // no other populated cluster

    const denom = Math.max(a,b);
    // a = b = 0 (identical rows) contributes 0 instead of 0/0
    if(denom > 0) sum += (b-a)/denom;
  }
  return sum/n;
}
// Random projection → 2D for plotting
// Project rows to 2-D with a seeded random linear map, then min-max scale each axis to [0, 1]
// for plotting. An axis on which every point coincides maps to 0.5. An empty matrix yields [].
function project2D(X, seed=17){
  if(X.length === 0) return [];
  // inline seeded LCG; the same seed always gives the same projection
  let r = (s=>()=> (s = (s*1664525 + 1013904223)>>>0) / 0xffffffff)(seed>>>0);
  const d = X[0].length;
  // R: d x 2 projection matrix with entries in [-0.5, 0.5]
  const R = Array.from({length:d}, ()=> [r()-0.5, r()-0.5]);
  const P = X.map(row=>{
    let x=0,y=0; for(let j=0;j<d;j++){ x += row[j]*R[j][0]; y += row[j]*R[j][1]; }
    return [x,y];
  });
  const xs=P.map(p=>p[0]), ys=P.map(p=>p[1]);
  const minx=Math.min(...xs), maxx=Math.max(...xs), miny=Math.min(...ys), maxy=Math.max(...ys);
  const nx=v=> (maxx-minx===0? .5 : (v-minx)/(maxx-minx));
  const ny=v=> (maxy-miny===0? .5 : (v-miny)/(maxy-miny));
  return P.map(([x,y])=> [nx(x), ny(y)]);
}
// ===================== Pipeline compute =====================
// Latest pipeline result; written by buildState() and read by the render* views.
let cache = null;

/**
 * Run the whole pipeline on the current control values and store the result
 * in `cache`: split the corpus text into documents, vectorize, cluster with
 * the selected algorithm, project to 2D, rank terms per cluster by mean
 * feature value, and compute the silhouette.
 */
function buildState(){
  // one document per non-empty line
  const docs = areaDocs.value
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean);
  const {X, vocab, tokDocs} = vectorize(docs);

  // cluster
  const k = kRange.value|0;
  let labels0;
  if(algoSel.value === "K-Means"){
    labels0 = kmeans(X, k, seedNum.value|0, 60);
  } else {
    labels0 = agglomerativeSingle(X, k);
  }
  // 1-based labels for display
  const labels = labels0.map((c) => c + 1);

  // projection
  const P = project2D(X, 17);

  // column index → term lookup
  const inv = [];
  vocab.forEach((col, word) => { inv[col] = word; });

  // row indices per cluster (array position is the 0-based cluster id)
  const d = X[0]?.length || 0;
  const clusterIdx = Array.from({length:k}, () => []);
  for(let i=0;i<labels.length;i++){
    clusterIdx[labels[i] - 1]?.push(i);
  }

  // top terms per cluster (by mean value), at most 40 each
  const topTerms = [];
  for(let c=0;c<k;c++){
    const members = clusterIdx[c];
    const mean = new Float64Array(d);
    if(members.length){
      for(const i of members){
        for(let j=0;j<d;j++) mean[j] += X[i][j];
      }
      for(let j=0;j<d;j++) mean[j] /= members.length;
    }
    const ranked = Array.from({length:d}, (_, j) => j);
    ranked.sort((p, q) => mean[q] - mean[p]);
    topTerms.push(ranked.slice(0, 40).map((j) => inv[j]));
  }

  // silhouette (metric uses the 0-based labels)
  const sil = silhouette(X, labels0);

  cache = {docs, X, vocab, tokDocs, labels, labels0, k, P, clusterIdx, topTerms, sil, inv};
}
// ===================== Views =====================
/**
 * Redraw the KPI strip from `cache`: document count, vocabulary size,
 * selected algorithm, k, and the silhouette score (3 decimals, or "—" when
 * it is not a finite number).
 */
function renderKPI(){
  const {docs, vocab, k, sil} = cache;
  kpi.innerHTML = "";

  // one KPI card; the optional sub-line is only rendered when non-empty
  const card = (title, value, sub="") => html`<div class="k"><b>${value}</b><div>${title}</div>${sub?html`<div class="hint">${sub}</div>`:""}</div>`;

  const silText = isFinite(sil) ? sil.toFixed(3) : "—";
  const entries = [
    ["Documents", docs.length],
    ["Vocab size", vocab.size],
    ["Algorithm", algoSel.value],
    ["k (clusters)", k],
    ["Silhouette", silText],
  ];
  kpi.append(...entries.map(([title, value]) => card(title, value)));
}
/**
 * Step 1 view: replace the contents of `view` with a numbered table of the
 * cached documents, followed by a short hint line.
 */
function renderData(){
  const rows = [];
  for(const [idx, text] of cache.docs.entries()){
    rows.push(html`<tr><td>${idx+1}</td><td>${text}</td></tr>`);
  }

  const body = html`<tbody></tbody>`;
  body.append(...rows);

  const table = html`<table class="tbl small"></table>`;
  table.append(html`<thead><tr><th>#</th><th>Text</th></tr></thead>`, body);

  const sec = html`<div></div>`;
  sec.append(
    html`<div class="h">Step 1 — Data</div>`,
    table,
    html`<div class="hint" style="margin-top:6px">Edit corpus in the left panel (one document per line)</div>`
  );

  view.innerHTML = "";
  view.append(sec);
}
/**
 * Step 2 view: show the first few cached documents next to a preview of the
 * tokens `tokenize` produces for them, under a hint line listing the current
 * normalization and n-gram settings.
 */
function renderToken(){
  const PREVIEW_DOCS = 6;     // documents shown in the table
  const PREVIEW_TOKENS = 24;  // tokens shown per document before the ellipsis

  const rows = cache.docs.slice(0, PREVIEW_DOCS).map((text, idx) => {
    const toks = tokenize(text);
    const shown = toks.slice(0, PREVIEW_TOKENS).join(" | ");
    const more = toks.length > PREVIEW_TOKENS ? " …" : "";
    return html`<tr><td>${idx+1}</td><td>${text}</td><td class="mono">${shown}${more}</td></tr>`;
  });

  const body = html`<tbody></tbody>`;
  body.append(...rows);

  const table = html`<table class="tbl small"></table>`;
  table.append(html`<thead><tr><th>#</th><th>Original</th><th>Tokens (preview)</th></tr></thead>`, body);

  const sec = html`<div></div>`;
  sec.append(
    html`<div class="h">Step 2 — Tokenization</div>`,
    html`<div class="hint">Lowercase / punctuation / stopwords + ${normSel.value}; N-grams: ${ngramSel.value}</div>`,
    table
  );

  view.innerHTML = "";
  view.append(sec);
}
/**
 * Step 3 view: show the selected feature type, the most frequent tokens by
 * document frequency as chips, and a small documents × tokens table of the
 * cached feature values.
 */
function renderFeat(){
  const {X, docs, inv, vocab} = cache;
  const V = vocab.size;

  // document frequency per column: number of rows with a positive value
  const df = new Float64Array(V);
  for(const row of X){
    for(let j=0;j<V;j++){
      if(row[j] > 0) df[j] += 1;
    }
  }
  // column indices, most frequent first
  const order = [...Array(V).keys()].sort((p, q) => df[q] - df[p]);

  // Top tokens by document frequency (chips)
  const chips = html`<div style="display:flex;flex-wrap:wrap;gap:6px;margin:8px 0"></div>`;
  for(const j of order.slice(0, 25)){
    chips.append(html`<span class="pill mono">${inv[j]}</span>`);
  }

  // === Tabular preview (docs × top tokens) ===
  const topCols = order.slice(0, 12);
  const maxRows = Math.min(10, docs.length);
  const thead = html`<thead><tr><th>#</th><th style="text-align:left">Doc (preview)</th>${topCols.map(j=> html`<th class="mono">${inv[j]}</th>`)}</tr></thead>`;
  const body = html`<tbody></tbody>`;
  for(let i=0;i<maxRows;i++){
    const tr = html`<tr><td>${i+1}</td><td style="text-align:left">${docs[i].slice(0,80)}</td></tr>`;
    tr.append(...topCols.map((j) => html`<td class="mono">${X[i][j].toFixed(3)}</td>`));
    body.append(tr);
  }
  const table = html`<table class="tbl small" style="margin-top:6px"></table>`;
  table.append(thead, body);

  const sec = html`<div></div>`;
  sec.append(
    html`<div class="h">Step 3 — Features</div>`,
    html`<div class="hint">Feature: <b>${featSel.value}</b> ${l2Ck.value? "+ L2 row-norm (for cosine)": ""}</div>`,
    html`<div>Top tokens (by document frequency):</div>`,
    chips,
    html`<div style="margin-top:6px"><b>Structural features (tabular preview)</b> — rows = documents, columns = selected tokens</div>`,
    table
  );

  view.innerHTML = "";
  view.append(sec);
}
/**
 * Step 4 view: legend with cluster sizes, a scatter plot of the cached 2D
 * projection coloured by cluster, and a table of the top terms per cluster
 * (as many as the `showTop` control asks for).
 */
function renderCluster(){
  const sec = html`<div></div>`;
  sec.append(html`<div class="h">Step 4 — Cluster</div>`);
  const colors = ["#2563eb","#059669","#d97706","#7c3aed","#ef4444","#10b981","#f59e0b","#8b5cf6","#06b6d4","#e11d48"];
  const legend = html`<div class="legend"></div>`;
  for(let c=0;c<cache.k;c++){
    const dot = html`<span class="dot" style="background:${colors[c%colors.length]}"></span>`;
    // display cluster number starting at 1
    legend.append(html`<span>${dot} Cluster ${c+1} (${cache.clusterIdx[c].length})</span>`);
  }
  sec.append(legend);

  // scatter plot
  // The SVG is not in the document yet while we draw, so its clientWidth is
  // always 0 and cannot be used for layout. Draw in a fixed W×H coordinate
  // space and let viewBox scale it to whatever width the container has;
  // without viewBox, points on the right are clipped in narrow containers.
  const W=900, H=360, pad=24;
  const svg = html`<svg class="plot" width="100%" height="${H}" viewBox="0 0 ${W} ${H}" preserveAspectRatio="xMidYMid meet"></svg>`;
  const g = d3.select(svg).append("g");
  g.append("rect").attr("x",0).attr("y",0).attr("width","100%").attr("height","100%").attr("fill","#fff");
  g.append("rect").attr("x",pad).attr("y",pad).attr("width",W-2*pad).attr("height",H-2*pad).attr("fill","none").attr("stroke","#e2e8f0");
  cache.P.forEach(([x,y],i)=>{
    // P is in [0,1]²; flip y so larger values are drawn higher up
    const cx = pad + x*(W-2*pad);
    const cy = pad + (1-y)*(H-2*pad);
    g.append("circle")
      .attr("cx",cx).attr("cy",cy).attr("r",4)
      .attr("fill", colors[(cache.labels[i]-1)%colors.length]) // labels are 1-based; color index 0-based
      .attr("opacity",0.9);
  });
  sec.append(svg);

  // top terms per cluster (table)
  const tbl = html`<table class="tbl small" style="margin-top:10px"></table>`;
  tbl.append(html`<thead><tr><th>Cluster</th><th>Top terms</th></tr></thead>`);
  const tb = html`<tbody></tbody>`;
  for(let c=0;c<cache.k;c++){
    // pills are assembled as one markup string and parsed by the inner html`` call
    const terms = cache.topTerms[c].slice(0, showTop.value|0).map(w=> `<span class="pill mono">${w}</span>`).join(" ");
    tb.append(html`<tr><td>${c+1} <span class="hint">(${cache.clusterIdx[c].length})</span></td><td>${html`${terms}`}</td></tr>`);
  }
  tbl.append(tb);
  sec.append(tbl);

  view.innerHTML="";
  view.append(sec);
}
/**
 * Step 5 view: one card per cluster with its document count, the silhouette
 * score (3 decimals, or "—" when not finite), and a table listing every
 * document with its 1-based cluster label.
 */
function renderEval(){
  const {sil, k, clusterIdx, labels, docs} = cache;
  const silText = isFinite(sil) ? sil.toFixed(3) : "—";

  // Cluster sizes (cards)
  const sizeCards = html`<div style="display:grid;grid-template-columns:repeat(${k}, minmax(0,1fr)); gap:8px"></div>`;
  for(let c=0;c<k;c++){
    sizeCards.append(html`<div class="k"><b>Cluster ${c+1}</b><div>${clusterIdx[c].length} docs</div></div>`);
  }

  // Document → Cluster table
  const body = html`<tbody></tbody>`;
  for(const [idx, text] of docs.entries()){
    body.append(html`<tr><td>${idx+1}</td><td>${labels[idx]}</td><td style="text-align:left">${text}</td></tr>`);
  }
  const table = html`<table class="tbl small" style="margin-top:10px"></table>`;
  table.append(html`<thead><tr><th>#</th><th>Cluster</th><th style="text-align:left">Text</th></tr></thead>`, body);

  const sec = html`<div></div>`;
  sec.append(
    html`<div class="h">Step 5 — Evaluate</div>`,
    sizeCards,
    html`<div class="hint" style="margin-top:8px">Silhouette score (−1..1, higher is better): <b>${silText}</b></div>`,
    html`<div style="margin-top:6px"><b>Assignments</b> — which document belongs to which cluster</div>`,
    table
  );

  view.innerHTML = "";
  view.append(sec);
}
/**
 * Redraw everything that depends on `cache`: the stepper, the KPI strip and
 * the view belonging to the current `step` (1–5). Any other step value
 * leaves the main view untouched.
 */
function refresh(){
  renderStepper();
  renderKPI();
  switch(step){
    case 1: renderData(); break;
    case 2: renderToken(); break;
    case 3: renderFeat(); break;
    case 4: renderCluster(); break;
    case 5: renderEval(); break;
    default: break;
  }
}
// Rebuild the cached pipeline state with buildState(), then redraw with refresh().
// Registered below as the "input" listener of the controls.
function recompute(){
buildState();
refresh();
}
// Initial compute & wire
// First paint: build the state once, then draw the stepper, KPI strip and the data view.
buildState();
renderStepper();
renderKPI();
renderData();
// Any "input" event on one of these controls recomputes the pipeline and redraws.
for(const el of [
  corpSel, loadBtn, areaDocs,
  lowerCk, punctCk, stopCk, normSel, ngramSel,
  featSel, l2Ck,
  algoSel, kRange, seedNum,
  showTop
]){
  el.addEventListener("input", recompute);
}
return box;
})();Bag of Words (Count)
What it is: Represents each document by raw token counts.
How to compute (per document \(d\), token \(t\)):
Tokenize text (lowercasing, punctuation removal, stopword filtering, stemming/lemmatization as configured).
Build a vocabulary of unique tokens.
For each token \(t\) in \(d\), set the feature value to the count: \[\text{BoW}(t,d) = \#\text{occurrences of } t \text{ in } d\]
Use when: You want a simple baseline and your documents are short and similarly sized.
Term Frequency (TF)
What it is: Normalizes raw counts by the document’s length, so longer docs don’t dominate just because they have more words.
How to compute: \[\text{TF}(t,d) = \frac{\#(t \text{ in } d)}{\sum_{w}\#(w \text{ in } d)}\]
i.e., the count of \(t\) divided by the total number of tokens in \(d\).
Use when: You need length-invariance across documents and care about within-document salience.
TF–IDF (Term Frequency–Inverse Document Frequency)
What it is: Downweights tokens that appear in many documents (e.g., “movie”, “phone”, “issue”) and upweights tokens that are distinctive for a document (e.g., “thrilling”, “refund”).
How to compute (common variant):
TF as above.
IDF (document rarity): \[\text{IDF}(t) = \log\frac{N}{\text{df}(t)} \quad\text{or}\quad \log\Big(\frac{N+1}{\text{df}(t)+1}\Big) + 1 ~(\text{smoothing})\]
where \(N\) = number of documents and \(\text{df}(t)\) = number of documents containing \(t\).
TF–IDF: \[\text{TF–IDF}(t,d) = \text{TF}(t,d)\times \text{IDF}(t)\]
Use when: You want to emphasize discriminative words and reduce the weight of ubiquitous words.
L2 row normalization (recommended for cosine similarity): After computing BoW/TF/TF–IDF, scale each document vector \(x_d\) so that \(\lVert x_d \rVert_2 = 1\).
N-grams (1–2): In addition to unigrams (single tokens), include bigrams like very_good. This captures short phrases and simple word order patterns.
Stemming / Lemmatization: Collapses inflected forms (running, runs → run; better → good with custom lemma map). This reduces sparsity and can improve generalization.
Stopword removal: Drops extremely common function words (“the”, “and”), which rarely help classification.
Corpus (N=2):
\(d_1\): “good movie good acting”
\(d_2\): “bad movie bad plot”
Vocabulary: {good, movie, acting, bad, plot}
BoW (counts)
\(d_1\): {good:2, movie:1, acting:1, bad:0, plot:0}
\(d_2\): {good:0, movie:1, acting:0, bad:2, plot:1}
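TF (divide by doc length; each doc length = 4)
\(d_1\): {good:0.5, movie:0.25, acting:0.25, bad:0, plot:0}
\(d_2\): {good:0, movie:0.25, acting:0, bad:0.5, plot:0.25}
IDF (plain, natural log; df: movie=2, others=1)
\(\text{IDF}(\text{movie}) = \log(2/2) = 0\); for every other term \(\text{IDF} = \log(2/1) \approx 0.693\)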
TF–IDF
\(d_1\): {good:0.5×0.693=0.347, movie:0, acting:0.25×0.693=0.173, bad:0, plot:0}
\(d_2\): {bad:0.5×0.693=0.347, movie:0, plot:0.25×0.693=0.173, good:0, acting:0}
Notice how “movie” (present in both docs) gets 0 weight with this IDF, while distinctive words keep weight.
BoW: simplest; can work okay with strong models and lots of data, but is length-biased.
TF: normalizes for length; good general baseline.
TF–IDF: usually best for traditional text classification and search ranking when you don’t use pretrained embeddings.
\(t\) = a term (aka token or feature). Examples: good, movie, running, or a bigram like very_good.
\(d\) = a document (one text item in your corpus). Examples: one review, one email, one tweet, one sentence—whatever unit you choose.
Quick glossary (ties it together):
Tiny example:
Documents:
\(d_1\): “good movie good acting”
\(d_2\): “bad movie bad plot”
Terms \(t\) (vocabulary): {good, movie, acting, bad, plot}
So when you see \(\text{TF}(t,d)\), just read it as: “the TF value for term \(t\) in document \(d\).”