# PaddleOCR_keyBERT/keyword_engine.py
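"""Keyword extraction helpers for OCR output.

Scores OCR text lines by position, size, and content, builds a "focus text"
from the most promising lines, and extracts keyphrases with KeyBERT when it
is installed, falling back to a simple line-based heuristic otherwise.
"""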
import re

try:
    from keybert import KeyBERT
    _HAS_KEYBERT = True
except Exception:
    _HAS_KEYBERT = False

_kw = None
# Substrings (English and Vietnamese) that mark boilerplate packaging text:
# ingredients, warnings, usage directions, contact details, batch/expiry codes,
# origin and net-weight statements.
BAD_SUBSTR = [
    "ingredients", "ingredient", "composition", "warning", "caution", "attention",
    "directions", "how to use", "usage", "instructions", "storage", "keep out",
    "manufactured", "manufacturer", "distributed", "imported", "company", "address",
    "hotline", "tel", "phone", "website", "www", "email", "facebook", "instagram",
    "barcode", "qr", "scan", "batch", "lot", "ref", "serial",
    "mfg", "exp", "use by", "best before", "nsx", "hsd", "ngày sản xuất", "hạn sử dụng",
    "thành phần", "hướng dẫn", "cảnh báo", "bảo quản", "xuất xứ", "nhà sản xuất",
    "khối lượng tịnh", "net wt", "net weight", "made in", "origin"
]
def _norm(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()
def _bad_text(s: str) -> bool:
    low = (s or "").lower()
    return any(b in low for b in BAD_SUBSTR)
def get_kw():
    """Lazily create and cache a KeyBERT instance; return None if unavailable."""
    global _kw
    if not _HAS_KEYBERT:
        return None
    if _kw is None:
        try:
            _kw = KeyBERT(model="sentence-transformers/all-MiniLM-L6-v2")
        except Exception:
            _kw = None
    return _kw
def _looks_like_packaging_mainline(t: str) -> bool:
    """Heuristic: does this line look like front-of-pack product/marketing text?"""
    low = t.lower()
    if _bad_text(t):
        return False
    # Net-content measurements (e.g. "500 ml", "100 g") usually sit near the product name.
    if re.search(r"\b(\d{1,4}\s?(ml|l|g|kg|oz|fl oz))\b", low):
        return True
    # Common cosmetic/marketing claims.
    if re.search(r"\b(spf\s?\d+|pa\+{1,4}|uv|whitening|moistur|hydration|anti|fresh|mint)\b", low):
        return True
    # Multi-word lines containing Latin or Vietnamese letters are plausible product text.
    if len(t.split()) >= 2 and re.search(r"[A-Za-zÀ-ỹ]", t):
        return True
    return False
def build_focus_text(items, img_h: int, top_ratio: float = 0.55, max_lines: int = 50):
    """Score OCR lines and return the most promising ones, one per line.

    Each item is expected to carry "text", "bbox" (x1, y1, x2, y2) and an
    optional "score" (detector confidence). Lines near the top of the image,
    larger lines, and lines that look like main packaging text score higher;
    boilerplate lines are penalized.
    """
    if not items or not img_h:
        return ""
    scored = []
    for it in items:
        t = _norm(it.get("text", ""))
        bb = it.get("bbox")
        conf = it.get("score", None)
        if not t or not bb:
            continue
        if not re.search(r"[A-Za-zÀ-ỹ]", t):
            continue
        x1, y1, x2, y2 = bb
        cy = (y1 + y2) / 2.0
        h = max(1.0, (y2 - y1))
        w = max(1.0, (x2 - x1))
        area = h * w
        words = t.split()
        # Skip short all-caps single tokens (often logos or codes).
        if len(words) == 1 and t.isupper() and len(t) <= 10:
            continue
        top_bonus = 1.8 if cy <= img_h * top_ratio else 1.0
        area_bonus = 0.9 + min(3.2, (area / (img_h * img_h)) * 45.0)
        conf_bonus = 1.0
        if isinstance(conf, (int, float)):
            conf_bonus = 0.85 + min(1.15, float(conf))
        intent_bonus = 1.25 if _looks_like_packaging_mainline(t) else 1.0
        penalty = 0.75 if _bad_text(t) else 1.0
        score = top_bonus * area_bonus * conf_bonus * intent_bonus * penalty
        scored.append((score, t))
    scored.sort(reverse=True)
    chosen = []
    seen = set()
    for _, t in scored:
        k = t.lower()
        if k in seen:
            continue
        seen.add(k)
        chosen.append(t)
        if len(chosen) >= max_lines:
            break
    return "\n".join(chosen)
def _simple_keywords(focus_text: str, top_n: int = 5):
    """Fallback extractor: take the top focus lines verbatim, skipping boilerplate."""
    lines = [l.strip() for l in (focus_text or "").splitlines() if l.strip()]
    out = []
    seen = set()
    for l in lines:
        t = _norm(l)
        if _bad_text(t):
            continue
        k = t.lower()
        if k in seen:
            continue
        seen.add(k)
        if len(t) < 3:
            continue
        out.append(t)
        if len(out) >= top_n:
            break
    return out
def extract_keywords_from_focus(focus_text: str, top_n: int = 5):
    """Extract up to top_n keyphrases from the focus text.

    Uses KeyBERT with MMR for diversity when available; otherwise falls back
    to the line-based heuristic in _simple_keywords.
    """
    focus_text = (focus_text or "").strip()
    if not focus_text:
        return []
    kw = get_kw()
    if kw is None:
        return _simple_keywords(focus_text, top_n=top_n)
    try:
        # Request more candidates than needed, then filter and deduplicate.
        pairs = kw.extract_keywords(
            focus_text,
            keyphrase_ngram_range=(1, 4),
            stop_words=None,
            use_mmr=True,
            diversity=0.75,
            top_n=max(20, top_n * 7),
        )
        out = []
        seen = set()
        for phrase, _ in pairs:
            p = _norm(phrase)
            if len(p) < 3:
                continue
            if _bad_text(p):
                continue
            k = p.lower()
            if k in seen:
                continue
            seen.add(k)
            out.append(p)
            if len(out) == top_n:
                break
        if out:
            return out
        return _simple_keywords(focus_text, top_n=top_n)
    except Exception:
        return _simple_keywords(focus_text, top_n=top_n)
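

if __name__ == "__main__":
    # Minimal usage sketch with hand-made items; in practice these would come
    # from an OCR pass (text, bounding box, confidence per detected line).
    # The texts, bbox values, and scores below are illustrative only.
    demo_items = [
        {"text": "Fresh Mint Whitening Toothpaste", "bbox": (40, 60, 460, 120), "score": 0.96},
        {"text": "100 g", "bbox": (60, 140, 160, 180), "score": 0.91},
        {"text": "Ingredients: aqua, sorbitol, hydrated silica", "bbox": (30, 600, 470, 640), "score": 0.88},
    ]
    focus = build_focus_text(demo_items, img_h=700)
    print(focus)
    print(extract_keywords_from_focus(focus, top_n=3))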