Spaces:
Running
Running
| import re | |
# KeyBERT is an optional dependency: record whether it could be imported so
# the rest of the module can fall back to a simpler path when it is missing.
try:
    from keybert import KeyBERT
    _HAS_KEYBERT = True
except Exception:  # any import-time failure is treated as "not available"
    _HAS_KEYBERT = False
# Module-level cache for the KeyBERT instance; populated lazily by get_kw().
_kw = None
| BAD_SUBSTR = [ | |
| "ingredients", "ingredient", "composition", "warning", "caution", "attention", | |
| "directions", "how to use", "usage", "instructions", "storage", "keep out", | |
| "manufactured", "manufacturer", "distributed", "imported", "company", "address", | |
| "hotline", "tel", "phone", "website", "www", "email", "facebook", "instagram", | |
| "barcode", "qr", "scan", "batch", "lot", "ref", "serial", | |
| "mfg", "exp", "use by", "best before", "nsx", "hsd", "ngày sản xuất", "hạn sử dụng", | |
| "thành phần", "hướng dẫn", "cảnh báo", "bảo quản", "xuất xứ", "nhà sản xuất", | |
| "khối lượng tịnh", "net wt", "net weight", "made in", "origin" | |
| ] | |
| def _norm(s: str) -> str: | |
| return re.sub(r"\s+", " ", (s or "")).strip() | |
| def _bad_text(s: str) -> bool: | |
| low = (s or "").lower() | |
| return any(b in low for b in BAD_SUBSTR) | |
def get_kw():
    """Return the shared KeyBERT instance, building it on first use.

    Returns ``None`` when the keybert package could not be imported or when
    constructing the model fails. A failed construction is not remembered, so
    the next call tries again.
    """
    global _kw
    if not _HAS_KEYBERT:
        return None
    if _kw is not None:
        # Already built on an earlier call.
        return _kw
    try:
        _kw = KeyBERT(model="sentence-transformers/all-MiniLM-L6-v2")
    except Exception:
        # Best effort: leave the cache empty so callers can fall back.
        _kw = None
    return _kw
# Pack size such as "200 ml", "50g" or "50 fl oz".
_PACK_SIZE_RE = re.compile(r"\b(\d{1,4}\s?(ml|l|g|kg|oz|fl oz))\b")

# Front-of-pack claims. Two alternatives deliberately have no trailing \b:
#   * "pa+" .. "pa++++" ends in "+", a non-word character, so a trailing \b
#     could only match when a word character followed immediately;
#   * "moistur" is a stem and has to match "moisture", "moisturizer",
#     "moisturising", ...
_CLAIM_RE = re.compile(
    r"\b(?:spf\s?\d+\b|pa\+{1,4}|(?:uv|whitening|hydration|anti|fresh|mint)\b|moistur\w*)"
)


def _looks_like_packaging_mainline(t: str) -> bool:
    """Return True when *t* reads like a headline line of product packaging.

    A line qualifies when it is not flagged by ``_bad_text`` and at least one
    of the following holds:

    * it mentions a pack size (number followed by ml/l/g/kg/oz/fl oz);
    * it contains a claim keyword (SPF value, PA rating, uv, whitening,
      moistur*, hydration, anti, fresh, mint);
    * it has two or more whitespace-separated words and at least one letter.
    """
    if _bad_text(t):
        return False
    low = t.lower()
    if _PACK_SIZE_RE.search(low) or _CLAIM_RE.search(low):
        return True
    # Fallback: any multi-word line that contains a letter.
    return len(t.split()) >= 2 and re.search(r"[A-Za-zÀ-ỹ]", t) is not None
def build_focus_text(items, img_h: int, top_ratio: float = 0.55, max_lines: int = 50):
    """Rank text items by prominence and join the best ones into one string.

    Args:
        items: iterable of mappings read through ``.get``: ``"text"`` (the
            text), ``"bbox"`` (unpacked as ``x1, y1, x2, y2``) and an
            optional numeric ``"score"``.
        img_h: image height, used to normalise position and box area.
        top_ratio: lines whose vertical centre lies within this fraction of
            the image height get a position bonus.
        max_lines: stop once this many distinct lines have been collected.

    Returns:
        The selected lines, highest score first, separated by newlines; an
        empty string when *items* or *img_h* is falsy.
    """
    if not (items and img_h):
        return ""

    ranked = []
    for entry in items:
        text = _norm(entry.get("text", ""))
        box = entry.get("bbox")
        if not text or not box:
            continue
        # Ignore lines without a single Latin/Vietnamese letter.
        if re.search(r"[A-Za-zÀ-ỹ]", text) is None:
            continue

        left, top, right, bottom = box
        center_y = (top + bottom) / 2.0
        height = max(1.0, (bottom - top))
        width = max(1.0, (right - left))
        box_area = height * width

        # A lone all-caps token of at most 10 characters is dropped.
        if len(text) <= 10 and text.isupper() and len(text.split()) == 1:
            continue

        position = 1.8 if center_y <= img_h * top_ratio else 1.0
        size = 0.9 + min(3.2, (box_area / (img_h * img_h)) * 45.0)
        confidence = entry.get("score", None)
        if isinstance(confidence, (int, float)):
            reliability = 0.85 + min(1.15, float(confidence))
        else:
            reliability = 1.0
        intent = 1.25 if _looks_like_packaging_mainline(text) else 1.0
        fine_print = 0.75 if _bad_text(text) else 1.0

        ranked.append((position * size * reliability * intent * fine_print, text))

    # Best score first; keep the first occurrence of each case-folded line.
    picked = {}
    for _, text in sorted(ranked, reverse=True):
        picked.setdefault(text.lower(), text)
        if len(picked) >= max_lines:
            break
    return "\n".join(picked.values())
| def _simple_keywords(focus_text: str, top_n: int = 5): | |
| lines = [l.strip() for l in (focus_text or "").splitlines() if l.strip()] | |
| out = [] | |
| seen = set() | |
| for l in lines: | |
| t = _norm(l) | |
| if _bad_text(t): | |
| continue | |
| k = t.lower() | |
| if k in seen: | |
| continue | |
| seen.add(k) | |
| if len(t) < 3: | |
| continue | |
| out.append(t) | |
| if len(out) >= top_n: | |
| break | |
| return out | |
def extract_keywords_from_focus(focus_text: str, top_n: int = 5):
    """Extract up to *top_n* keyword phrases from *focus_text*.

    KeyBERT is used when ``get_kw()`` provides an instance. Its candidates
    are whitespace-normalised, then filtered: phrases shorter than three
    characters, phrases flagged by ``_bad_text`` and case-insensitive
    duplicates are dropped. If KeyBERT is unavailable, raises, or yields
    nothing usable, the result comes from ``_simple_keywords`` instead.

    Args:
        focus_text: newline-separated text; ``None`` is treated as empty.
        top_n: maximum number of phrases to return. A value of zero or less
            returns an empty list.

    Returns:
        A list of at most *top_n* phrases.
    """
    focus_text = (focus_text or "").strip()
    # A non-positive limit asks for nothing. Without this guard the loop
    # below would never reach its stop condition and return every candidate.
    if not focus_text or top_n <= 0:
        return []

    kw = get_kw()
    if kw is None:
        return _simple_keywords(focus_text, top_n=top_n)

    try:
        # Over-fetch so enough candidates survive the filtering below.
        pairs = kw.extract_keywords(
            focus_text,
            keyphrase_ngram_range=(1, 4),
            stop_words=None,
            use_mmr=True,
            diversity=0.75,
            top_n=max(20, top_n * 7),
        )
        out = []
        seen = set()
        for phrase, _ in pairs:
            p = _norm(phrase)
            if len(p) < 3 or _bad_text(p):
                continue
            k = p.lower()
            if k in seen:
                continue
            seen.add(k)
            out.append(p)
            if len(out) >= top_n:
                break
    except Exception:
        # Best effort: any model failure discards partial results and falls
        # through to the model-free extractor.
        out = []

    return out or _simple_keywords(focus_text, top_n=top_n)