"""Postgres Full-Text Search (Typ 2) + pg_trgm Fuzzy (Typ 3) + Substring fallback.""" import re from app import db from app.schemas import SearchResultItem COLUMNS = """ vehicle_id, commission_number, vin, brand, model, title, price::float8 as price, primary_image_id::text as primary_image_id, directus_product_id """ # Sanitize tsquery input — keep word chars, spaces, dots, hyphens _SAFE_TOKEN = re.compile(r"[^\w\s\.\-]", re.UNICODE) def _build_prefix_tsquery(query: str) -> str: """Convert free-text query into a safe tsquery with prefix match. 'ray' -> 'ray:*' 'zero motor' -> 'zero:* & motor:*' """ cleaned = _SAFE_TOKEN.sub(" ", query) tokens = [t for t in cleaned.split() if t] if not tokens: return "" return " & ".join(f"{t}:*" for t in tokens) async def fts_search(query: str, limit: int = 10) -> list[SearchResultItem]: """Full-text search with German + unaccent dictionary, prefix-enabled.""" tsquery_str = _build_prefix_tsquery(query) if not tsquery_str: return [] sql = f""" SELECT {COLUMNS}, ts_rank_cd(search_tsv, q)::float8 AS rank FROM search_vehicles, to_tsquery('german_unaccent', $1) q WHERE search_tsv @@ q ORDER BY rank DESC, brand, model LIMIT $2 """ try: rows = await db.fetch(sql, tsquery_str, limit) except Exception: sql_fallback = f""" SELECT {COLUMNS}, ts_rank_cd(search_tsv, q)::float8 AS rank FROM search_vehicles, plainto_tsquery('german_unaccent', $1) q WHERE search_tsv @@ q ORDER BY rank DESC, brand, model LIMIT $2 """ rows = await db.fetch(sql_fallback, query, limit) return [_to_item(r, "fts", float(r["rank"] or 0)) for r in rows] async def fuzzy_search(query: str, limit: int = 10) -> list[SearchResultItem]: """Trigram similarity for typo tolerance.""" sql = f""" WITH scored AS ( SELECT {COLUMNS}, GREATEST( similarity(title, $1), similarity(COALESCE(commission_number, ''), $1) )::float8 AS sim FROM search_vehicles WHERE title % $1 OR commission_number % $1 ) SELECT * FROM scored WHERE sim > 0.25 ORDER BY sim DESC LIMIT $2 """ rows = await db.fetch(sql, query, limit) return [_to_item(r, "fuzzy", float(r["sim"])) for r in rows] async def substring_search(query: str, limit: int = 10) -> list[SearchResultItem]: """Last-resort: ILIKE substring on title/brand/model. Catches stop-words (e.g. German 'es') that FTS strips, and very short queries that don't survive stemming. Uses the existing trgm GIN index. """ pattern = f"%{query.lower()}%" sql = f""" SELECT {COLUMNS}, 0.5::float8 AS rank FROM search_vehicles WHERE LOWER(title) LIKE $1 OR LOWER(brand) LIKE $1 OR LOWER(COALESCE(model, '')) LIKE $1 ORDER BY brand, model LIMIT $2 """ rows = await db.fetch(sql, pattern, limit) return [_to_item(r, "substring", 0.5) for r in rows] async def hybrid_search(query: str, limit: int = 10) -> list[SearchResultItem]: """FTS first; if too few hits, add fuzzy; if still empty, add substring. Strategy: 1. Always run FTS (cheap, indexed, prefix-enabled) 2. If FTS has >= 3 results, return them 3. Otherwise also run fuzzy and merge 4. If still nothing, run substring fallback (handles stop-words) """ fts_results = await fts_search(query, limit=limit) if len(fts_results) >= 3: return fts_results fuzzy_results = await fuzzy_search(query, limit=limit) seen: dict[int, SearchResultItem] = {r.vehicle_id: r for r in fts_results} for r in fuzzy_results: if r.vehicle_id not in seen: seen[r.vehicle_id] = r if not seen: # Last resort: substring (catches stop-words like 'es') substring_results = await substring_search(query, limit=limit) for r in substring_results: seen[r.vehicle_id] = r merged = list(seen.values()) merged.sort(key=lambda x: ( 0 if x.matched_via == "fts" else (1 if x.matched_via == "fuzzy" else 2), -x.score, )) return merged[:limit] def _to_item(r, matched_via: str, score: float) -> SearchResultItem: return SearchResultItem( vehicle_id=r["vehicle_id"], commission_number=r["commission_number"], vin=r["vin"], brand=r["brand"], model=r["model"], title=r["title"] or "", price=r["price"], primary_image_id=r["primary_image_id"], directus_product_id=r["directus_product_id"], score=score, matched_via=matched_via, )