!pip install -q dyf[io] datasets sentence-transformers

DYF Quickstart — Density Yields Features
DYF discovers structure in embedding spaces using density-based LSH. Every item gets classified as Dense (core), Bridge (connector), or Orphan (unique) — no k to pick, no hyperparameters to tune.
This notebook walks through: 1. Embedding real text data (AG News headlines) 2. Density classification — what’s core, what’s bridging, what’s unique? 3. Building a .dyf search index 4. Searching with adaptive probing
1. Install dependencies
2. Load AG News
AG News has 4 categories: World, Sports, Business, Sci/Tech. We grab 5,000 headlines for speed.
from datasets import load_dataset

# AG News is a 4-class news classification dataset; the first 5,000 training
# rows are enough for a fast demo.
ds = load_dataset("ag_news", split="train[:5000]")

# Integer label -> human-readable category name.
AG_LABELS = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
texts = ds["text"]
categories = [AG_LABELS[label] for label in ds["label"]]

print(f"{len(texts)} headlines loaded")
print(f"Example: {texts[0][:100]}...")

# 3. Embed with sentence-transformers
all-MiniLM-L6-v2 produces 384-d embeddings and runs fast on CPU.
from sentence_transformers import SentenceTransformer
import numpy as np

# all-MiniLM-L6-v2 produces 384-d embeddings and is fast enough on CPU.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
print(f"Embeddings shape: {embeddings.shape}")

# 4. Density classification
DYF’s DensityClassifier buckets items via PCA-based LSH, then computes per-item density metrics. No k to choose, no cluster count to guess.
from dyf import DensityClassifier

# PCA-based LSH density classifier — no k or cluster count to choose.
classifier = DensityClassifier(
    embedding_dim=384,        # must match the sentence-transformer output dim
    num_bits=14,              # LSH resolution (number of hash bits)
    num_stability_seeds=3,    # extra seeds to stabilize bucket assignment
)
classifier.fit(embeddings)
print(classifier.report())

# 5. Inspect Dense, Bridge, and Orphan items
- Dense: High centroid similarity — the item sits squarely in a well-populated region
- Bridge: Low centroid similarity — the item connects different semantic regions
- Orphan: Small bucket — the item has few semantic neighbors
We use analyze_bridges() to find bridge items and their connections.
# Get per-item metrics (returned as numpy arrays)
centroid_sims = classifier.get_centroid_similarities()
bucket_sizes = classifier.get_bucket_sizes()

# Simple classification from raw metrics:
#   orphan = tiny bucket; bridge = low centroid similarity; dense = the rest.
is_orphan = bucket_sizes <= 2
is_bridge = (~is_orphan) & (centroid_sims < 0.5)
is_dense = (~is_orphan) & (centroid_sims >= 0.5)

n = len(texts)
print(f"Dense: {is_dense.sum():>5} ({is_dense.sum()/n:.1%}) — core items")
print(f"Bridge: {is_bridge.sum():>5} ({is_bridge.sum()/n:.1%}) — connectors")
print(f"Orphan: {is_orphan.sum():>5} ({is_orphan.sum()/n:.1%}) — unique items")

print("\n--- Example Dense headlines (high centroid similarity) ---")
dense_idx = np.where(is_dense)[0]
for i in dense_idx[:3]:
    print(f" [{categories[i]}] sim={centroid_sims[i]:.3f} {texts[i][:90]}")

print("\n--- Example Orphan headlines (tiny buckets) ---")
orphan_idx = np.where(is_orphan)[0]
for i in orphan_idx[:3]:
    print(f" [{categories[i]}] bucket_size={bucket_sizes[i]} {texts[i][:90]}")

# Bridge analysis — which regions do bridges connect?
# Bridge items have centroid similarity below the threshold; the analysis
# reports which pairs of LSH buckets they connect.
analysis = classifier.analyze_bridges(embeddings, bridge_threshold=0.5)
print(analysis)

print("\nTop connected bucket pairs:")
for bucket_a, bucket_b, count in analysis.top_connected_pairs(5):
    print(f" Bucket {bucket_a} <-> Bucket {bucket_b}: {count} bridges")

# Show a few bridge headlines
print("\n--- Example Bridge headlines ---")
for i in analysis.bridge_indices[:5]:
    print(f" [{categories[i]}] sim={centroid_sims[i]:.3f} {texts[i][:90]}")

# 6. Build a .dyf search index
A .dyf file is a memory-mapped index with zero startup cost. We store the headline text and category alongside the embeddings.
from dyf import build_dyf_tree, write_lazy_index

# Build a shallow density tree, then persist it as a memory-mapped .dyf index
# with the headline text and category stored alongside the embeddings.
tree = build_dyf_tree(embeddings, max_depth=4, num_bits=3, min_leaf_size=8)
write_lazy_index(
    tree, embeddings, "ag_news.dyf",
    quantization="float16",  # halves on-disk embedding size
    stored_fields={"title": texts, "category": categories},
    metadata={"model": "all-MiniLM-L6-v2", "dataset": "ag_news"},
)
print("Index written to ag_news.dyf")

# 7. Search the index
Open the .dyf file and search with a query embedding. Stored fields come back with results.
from dyf import LazyIndex

query = model.encode("stock market crash")

# Open the memory-mapped index; stored fields come back with each result.
with LazyIndex("ag_news.dyf") as idx:
    print(f"Index: {idx.total_items} items, {idx.embedding_dim}d embeddings")
    print(f"Stored fields: {idx.stored_field_names}\n")
    result = idx.search(query, k=10)
    print('Query: "stock market crash"\n')
    for i, (score, title, cat) in enumerate(
        zip(result.scores, result.fields["title"], result.fields["category"])
    ):
        print(f" {i+1:2d}. [{cat}] {score:.3f} {title[:80]}")

# 8. Adaptive probing
With nprobe="auto", DYF decides how many tree leaves to search based on how confident the routing is. Unambiguous queries probe fewer leaves (faster); ambiguous queries probe more (better recall).
queries = [
    "NBA playoffs basketball", # clearly Sports
    "new iPhone release", # clearly Sci/Tech
    "tech companies report earnings", # Sci/Tech + Business overlap
]

# nprobe="auto": unambiguous queries probe fewer leaves, ambiguous ones more.
with LazyIndex("ag_news.dyf") as idx:
    for q in queries:
        qvec = model.encode(q)
        result = idx.search(qvec, k=5, nprobe="auto", return_routing=True)
        r = result.routing  # per-query routing diagnostics dict
        print(f'Query: "{q}"')
        print(f" Leaves probed: {len(r['leaves_probed'])} "
              f"Candidates scored: {r['candidates_scored']} "
              f"Min margin: {r['min_margin']:.4f} "
              f"Elapsed: {r['elapsed_ms']:.1f}ms")
        print(f" Top result: [{result.fields['category'][0]}] "
              f"{result.fields['title'][0][:70]}")
        print()

# 9. Tree leaves = recursive density classification
# build_dyf_tree recursively applies DensityClassifier at each level — each
# leaf is a density bucket from that process. To verify, we extract leaf
# contents from the .dyf index and measure their cohesion: how tightly
# clustered the items are compared to a random sample of the same size.
def _mean_centroid_sim(embs):
    """Mean cosine similarity of each row of `embs` to the set's unit centroid."""
    centroid = embs.mean(axis=0)
    centroid /= np.linalg.norm(centroid)
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    normed = embs / np.clip(norms, 1e-8, None)
    return float(np.mean(normed @ centroid))

with LazyIndex("ag_news.dyf") as idx:
    tree_nodes = idx.get_tree_structure()
    leaves = [n for n in tree_nodes if n["is_leaf"]]
    print(f"Tree has {len(tree_nodes)} nodes, {len(leaves)} leaves\n")

    # Measure cohesion: mean cosine similarity to the leaf centroid, versus a
    # random sample of the same size drawn from the full dataset.
    leaf_cohesions = []
    random_cohesions = []
    rng = np.random.default_rng(42)
    for leaf in leaves:
        batch = idx.get_leaf(leaf["batch_index"])
        emb_col = batch.column("embedding")
        # Arrow fixed-size-list column -> (n_items, dim) numpy array.
        leaf_embs = emb_col.values.to_numpy(zero_copy_only=False).reshape(
            -1, emb_col.type.list_size
        )
        if len(leaf_embs) < 3:
            continue  # too few items for a meaningful cohesion estimate
        rand_idx = rng.choice(len(embeddings), size=len(leaf_embs), replace=False)
        leaf_cohesions.append(_mean_centroid_sim(leaf_embs))
        random_cohesions.append(_mean_centroid_sim(embeddings[rand_idx]))

leaf_cohesions = np.array(leaf_cohesions)
random_cohesions = np.array(random_cohesions)
print(f"Leaf centroid similarity: mean={leaf_cohesions.mean():.3f} "
      f"median={np.median(leaf_cohesions):.3f}")
print(f"Random sample similarity: mean={random_cohesions.mean():.3f} "
      f"median={np.median(random_cohesions):.3f}")
print(f"\nLeaves are {leaf_cohesions.mean() / random_cohesions.mean():.1f}x "
      f"more cohesive than random samples of the same size.")
print("Each leaf is a genuine density cluster, not an arbitrary partition.")

# 10.
Proving orphans are genuinely isolated

DYF labels items with tiny buckets as "orphans." But are they really isolated, or just unlucky hash collisions? We validate by computing k-nearest-neighbor cosine similarities directly on the embeddings — no LSH involved. If orphans are genuine, their nearest neighbors should be much more distant than a dense item's neighbors.
def knn_cosine_sims(query_emb, all_embs, k=10):
    """Compute cosine similarities to k nearest neighbors (brute force).

    Assumes `query_emb` is itself a row of `all_embs`: the single top match
    (the item itself, similarity ~1.0) is dropped, and the next k are returned
    in descending-similarity order.
    """
    query_norm = query_emb / np.linalg.norm(query_emb)
    norms = np.linalg.norm(all_embs, axis=1, keepdims=True)
    normed = all_embs / np.clip(norms, 1e-8, None)
    sims = normed @ query_norm
    top_k = np.argsort(sims)[-k-1:-1][::-1]  # exclude self
    return sims[top_k]

# Pick a concrete orphan and a concrete dense item
orphan_idx = np.where(is_orphan)[0]
dense_idx = np.where(is_dense)[0]

# Choose the orphan with smallest bucket and densest item with highest centroid sim
orphan_i = orphan_idx[np.argmin(bucket_sizes[orphan_idx])]
dense_i = dense_idx[np.argmax(centroid_sims[dense_idx])]

print(f"Orphan (idx={orphan_i}):")
print(f" Text: {texts[orphan_i][:100]}")
print(f" Bucket size: {bucket_sizes[orphan_i]}, Centroid sim: {centroid_sims[orphan_i]:.3f}")
orphan_knn = knn_cosine_sims(embeddings[orphan_i], embeddings, k=10)
print(f" 10-NN cosine sims: mean={orphan_knn.mean():.3f}, "
      f"min={orphan_knn.min():.3f}, max={orphan_knn.max():.3f}")

print(f"\nDense item (idx={dense_i}):")
print(f" Text: {texts[dense_i][:100]}")
print(f" Bucket size: {bucket_sizes[dense_i]}, Centroid sim: {centroid_sims[dense_i]:.3f}")
dense_knn = knn_cosine_sims(embeddings[dense_i], embeddings, k=10)
print(f" 10-NN cosine sims: mean={dense_knn.mean():.3f}, "
      f"min={dense_knn.min():.3f}, max={dense_knn.max():.3f}")

# Aggregate over all orphans vs all dense items
print("\n--- Aggregate: mean 10-NN similarity ---")
orphan_means = [knn_cosine_sims(embeddings[i], embeddings, k=10).mean()
                for i in orphan_idx[:100]]  # sample up to 100 for speed
dense_means = [knn_cosine_sims(embeddings[i], embeddings, k=10).mean()
               for i in dense_idx[:100]]
print(f"Orphans (n={len(orphan_means)}): mean 10-NN sim = {np.mean(orphan_means):.3f}")
print(f"Dense (n={len(dense_means)}): mean 10-NN sim = {np.mean(dense_means):.3f}")
print(f"\nDense items' neighbors are {np.mean(dense_means)/np.mean(orphan_means):.2f}x "
      f"more similar — orphans are genuinely isolated.")

# 11. Proving bridges connect distinct regions
# bridge_persistence() finds items that act as connectors across multiple LSH
# resolutions — the more resolutions an item bridges, the more structurally
# important it is. We validate by computing direct cosine similarities to
# bucket centroids: a true bridge should be roughly equidistant between two
# regions, while a dense item is strongly tied to one.
# Find persistent bridges — items that connect regions across multiple LSH resolutions
bp = classifier.bridge_persistence(embeddings, relative_threshold=0.8)
persistence = np.array(bp.bridge_persistence)
print(f"Items with persistence > 0: {(persistence > 0).sum()}")
print(f"Max persistence: {persistence.max()} (out of {classifier.num_bits} resolutions)\n")

# Pick the most persistent bridge
bridge_i = int(np.argmax(persistence))
bridge_persist, bridge_min_d, bridge_max_d, bridge_ratio = bp.item_profile(bridge_i)
print(f"Most persistent bridge (idx={bridge_i}):")
print(f" Text: {texts[bridge_i][:100]}")
print(f" Persistence: {bridge_persist} depths (range {bridge_min_d}-{bridge_max_d}), "
      f"peak ratio: {bridge_ratio:.2f}")

# Compute bucket centroids and show the bridge sits between regions
bucket_ids = classifier.get_bucket_ids()
unique_buckets = np.unique(bucket_ids)


def _nearest_other_centroid_sim(unit_emb, exclude_bucket):
    """Return (sim, bucket) of the most-similar unit centroid among all buckets
    other than `exclude_bucket`; buckets with fewer than 2 items are skipped
    because their centroid is just noise."""
    best_sim = -1.0
    best_bucket = -1
    for b in unique_buckets:
        if b == exclude_bucket:
            continue
        mask = bucket_ids == b
        if mask.sum() < 2:
            continue
        centroid = embeddings[mask].mean(axis=0)
        centroid /= np.linalg.norm(centroid)
        sim = float(unit_emb @ centroid)
        if sim > best_sim:
            best_sim = sim
            best_bucket = b
    return best_sim, best_bucket


own_bucket = bucket_ids[bridge_i]
own_mask = bucket_ids == own_bucket
own_centroid = embeddings[own_mask].mean(axis=0)
own_centroid /= np.linalg.norm(own_centroid)

bridge_emb = embeddings[bridge_i] / np.linalg.norm(embeddings[bridge_i])
best_other_sim, best_other_bucket = _nearest_other_centroid_sim(bridge_emb, own_bucket)
own_sim = float(bridge_emb @ own_centroid)

print(f"\n Similarity to own bucket centroid (bucket {own_bucket}): {own_sim:.3f}")
print(f" Similarity to nearest other centroid (bucket {best_other_bucket}): {best_other_sim:.3f}")
print(f" Ratio (other/own): {best_other_sim/own_sim:.2f} — bridge is pulled toward both regions")

# Contrast with a dense item
dense_i = dense_idx[np.argmax(centroid_sims[dense_idx])]
dense_bucket = bucket_ids[dense_i]
dense_mask = bucket_ids == dense_bucket
dense_own_centroid = embeddings[dense_mask].mean(axis=0)
dense_own_centroid /= np.linalg.norm(dense_own_centroid)
dense_emb = embeddings[dense_i] / np.linalg.norm(embeddings[dense_i])
dense_own_sim = float(dense_emb @ dense_own_centroid)
dense_best_other, _ = _nearest_other_centroid_sim(dense_emb, dense_bucket)

print(f"\nDense item (idx={dense_i}) for contrast:")
print(f" Similarity to own centroid: {dense_own_sim:.3f}")
print(f" Similarity to nearest other: {dense_best_other:.3f}")
print(f" Ratio (other/own): {dense_best_other/dense_own_sim:.2f} — "
      f"dense item is anchored to its own region")
print(f"\nBridge ratio {best_other_sim/own_sim:.2f} vs dense ratio "
      f"{dense_best_other/dense_own_sim:.2f} — bridges genuinely straddle regions.")

# Next steps
- How It Works — the PCA-LSH algorithm, density metrics, and Krapivin hash density
- API Reference — full docs for DensityClassifier, LazyIndex, and more
- GitHub — source, issues, and examples