levineuwirth.org/static/js/semantic-search.js

221 lines
8.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* semantic-search.js — Client-side semantic search using paragraph embeddings.
*
* At build time, tools/embed.py produces:
* /data/semantic-index.bin raw Float32Array (N_paragraphs × 384 dims)
* /data/semantic-meta.json [{url, title, heading, excerpt}, ...]
*
* At query time, transformers.js embeds the user's query with all-MiniLM-L6-v2
* (same model used at build time) and ranks paragraphs by cosine similarity.
* All computation is client-side; no server required.
*
* Model: Xenova/all-MiniLM-L6-v2 (~22 MB quantized, cached by browser after first load)
* Model files served from /models/all-MiniLM-L6-v2/ (same-origin; run tools/download-model.sh)
* Index format: raw little-endian Float32, shape [N, 384], unit-normalized
*
* CSP: requires cdn.jsdelivr.net in script-src (transformers.js library).
* connect-src stays 'self' — model weights are served same-origin.
*/
(function () {
'use strict';
var MODEL = 'all-MiniLM-L6-v2'; /* local name, no Xenova/ prefix */
var MODEL_PATH = '/models/'; /* served same-origin */
var DIM = 384;
var TOP_K = 8;
var CDN = 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2';
var extractor = null; /* loaded lazily on first search */
var vectors = null; /* Float32Array, shape [N, DIM] */
var meta = null; /* [{url, title, heading, excerpt}] */
var indexReady = false;
var queryEl = document.getElementById('semantic-query');
var statusEl = document.getElementById('semantic-status');
var resultsEl = document.getElementById('semantic-results');
if (!queryEl) return; /* not on the search page */
/* ------------------------------------------------------------------
Index loading — fetch once, lazily
------------------------------------------------------------------ */
function loadIndex() {
if (indexReady) return Promise.resolve();
return Promise.all([
fetch('/data/semantic-index.bin').then(function (r) {
if (!r.ok) throw new Error('semantic-index.bin not found');
return r.arrayBuffer();
}),
fetch('/data/semantic-meta.json').then(function (r) {
if (!r.ok) throw new Error('semantic-meta.json not found');
return r.json();
}),
]).then(function (results) {
vectors = new Float32Array(results[0]);
meta = results[1];
indexReady = true;
});
}
/* ------------------------------------------------------------------
Model loading — dynamic import from CDN, lazy
------------------------------------------------------------------ */
function loadModel() {
if (extractor) return Promise.resolve(extractor);
setStatus('Loading model…');
return import(CDN).then(function (mod) {
/* Point transformers.js at our self-hosted model files. */
mod.env.localModelPath = MODEL_PATH;
mod.env.allowRemoteModels = false;
return mod.pipeline('feature-extraction', MODEL, { quantized: true });
}).then(function (pipe) {
extractor = pipe;
return extractor;
});
}
/* ------------------------------------------------------------------
Search
------------------------------------------------------------------ */
function cosineSims(queryVec) {
/* queryVec is already unit-normalized; dot product = cosine similarity */
var N = meta.length;
var scores = new Float32Array(N);
for (var i = 0; i < N; i++) {
var dot = 0;
var off = i * DIM;
for (var d = 0; d < DIM; d++) dot += queryVec[d] * vectors[off + d];
scores[i] = dot;
}
return scores;
}
function topK(scores) {
var indices = Array.from({ length: meta.length }, function (_, i) { return i; });
indices.sort(function (a, b) { return scores[b] - scores[a]; });
return indices.slice(0, TOP_K).map(function (i) {
return { idx: i, score: scores[i] };
});
}
function runSearch(query) {
query = query.trim();
if (!query) { clearResults(); return; }
setStatus('Searching…');
var indexPromise = loadIndex().catch(function (err) {
setStatus('Semantic index not available — run make build first.');
throw err;
});
var modelPromise = loadModel();
Promise.all([indexPromise, modelPromise]).then(function (results) {
var pipe = results[1];
return pipe(query, { pooling: 'mean', normalize: true });
}).then(function (output) {
var queryVec = output.data; /* Float32Array, length 384 */
var scores = cosineSims(queryVec);
var hits = topK(scores);
renderResults(hits);
setStatus(hits.length ? '' : 'No results found.');
}).catch(function (err) {
if (err.message && err.message.indexOf('not available') === -1) {
setStatus('Search error — see console for details.');
console.error('semantic-search:', err);
}
});
}
/* ------------------------------------------------------------------
Rendering
------------------------------------------------------------------ */
function renderResults(hits) {
if (!hits.length) { clearResults(); return; }
var html = '<ol class="semantic-results-list">';
for (var i = 0; i < hits.length; i++) {
var h = hits[i];
var m = meta[h.idx];
var sameHeading = m.heading === m.title;
html += '<li class="semantic-result">'
+ '<a class="semantic-result-title" href="' + esc(m.url) + '">'
+ esc(m.title) + '</a>';
if (!sameHeading) {
html += '<span class="semantic-result-heading"> § ' + esc(m.heading) + '</span>';
}
html += '<p class="semantic-result-excerpt">' + esc(m.excerpt) + '</p>'
+ '</li>';
}
html += '</ol>';
resultsEl.innerHTML = html;
}
function clearResults() {
resultsEl.innerHTML = '';
}
function setStatus(msg) {
statusEl.textContent = msg;
}
function esc(s) {
return String(s)
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;');
}
/* ------------------------------------------------------------------
Tab switching — persists choice in localStorage
------------------------------------------------------------------ */
var STORAGE_KEY = 'search-tab';
function activateTab(target) {
document.querySelectorAll('.search-tab').forEach(function (b) {
var active = b.dataset.tab === target;
b.classList.toggle('is-active', active);
b.setAttribute('aria-selected', active ? 'true' : 'false');
});
document.querySelectorAll('.search-panel').forEach(function (p) {
p.classList.toggle('is-active', p.dataset.panel === target);
});
try { localStorage.setItem(STORAGE_KEY, target); } catch (e) {}
}
document.querySelectorAll('.search-tab').forEach(function (btn) {
btn.addEventListener('click', function () { activateTab(btn.dataset.tab); });
});
/* Restore last-used tab (falls back to keyword if unset or unrecognised) */
var saved = null;
try { saved = localStorage.getItem(STORAGE_KEY); } catch (e) {}
if (saved === 'semantic') activateTab('semantic');
/* ------------------------------------------------------------------
Input handling — debounced, 400 ms
------------------------------------------------------------------ */
var debounceTimer = null;
queryEl.addEventListener('input', function () {
clearTimeout(debounceTimer);
var q = queryEl.value.trim();
if (!q) { clearResults(); setStatus(''); return; }
debounceTimer = setTimeout(function () { runSearch(q); }, 400);
});
/* Pre-fill from ?q= on load — mirror keyword search behaviour */
var params = new URLSearchParams(window.location.search);
var initial = params.get('q');
if (initial) {
queryEl.value = initial;
runSearch(initial);
}
}());