# PaddleOCR_keyBERT/keyword_engine.py
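"""Keyword extraction helpers for OCR output.

Scores OCR text lines by position, size, and content, builds a "focus text"
from the most promising lines, and extracts keyphrases with KeyBERT when it
is installed, falling back to a simple line-based heuristic otherwise.
"""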
import re

try:
    from keybert import KeyBERT
    _HAS_KEYBERT = True
except Exception:
    _HAS_KEYBERT = False

_kw = None
# Substrings (English and Vietnamese) that mark boilerplate packaging text:
# ingredients, warnings, usage directions, contact details, batch/expiry codes,
# origin and net-weight statements.
BAD_SUBSTR = [
    "ingredients", "ingredient", "composition", "warning", "caution", "attention",
    "directions", "how to use", "usage", "instructions", "storage", "keep out",
    "manufactured", "manufacturer", "distributed", "imported", "company", "address",
    "hotline", "tel", "phone", "website", "www", "email", "facebook", "instagram",
    "barcode", "qr", "scan", "batch", "lot", "ref", "serial",
    "mfg", "exp", "use by", "best before", "nsx", "hsd", "ngày sản xuất", "hạn sử dụng",
    "thành phần", "hướng dẫn", "cảnh báo", "bảo quản", "xuất xứ", "nhà sản xuất",
    "khối lượng tịnh", "net wt", "net weight", "made in", "origin"
]
def _norm(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()
def _bad_text(s: str) -> bool:
    low = (s or "").lower()
    return any(b in low for b in BAD_SUBSTR)
def get_kw():
    """Lazily create and cache a KeyBERT instance; return None if unavailable."""
    global _kw
    if not _HAS_KEYBERT:
        return None
    if _kw is None:
        try:
            _kw = KeyBERT(model="sentence-transformers/all-MiniLM-L6-v2")
        except Exception:
            _kw = None
    return _kw
def _looks_like_packaging_mainline(t: str) -> bool:
    """Heuristic: does this line look like front-of-pack product/marketing text?"""
    low = t.lower()
    if _bad_text(t):
        return False
    # Net-content measurements (e.g. "500 ml", "100 g") usually sit near the product name.
    if re.search(r"\b(\d{1,4}\s?(ml|l|g|kg|oz|fl oz))\b", low):
        return True
    # Common cosmetic/marketing claims.
    if re.search(r"\b(spf\s?\d+|pa\+{1,4}|uv|whitening|moistur|hydration|anti|fresh|mint)\b", low):
        return True
    # Multi-word lines containing Latin or Vietnamese letters are plausible product text.
    if len(t.split()) >= 2 and re.search(r"[A-Za-zÀ-ỹ]", t):
        return True
    return False
def build_focus_text(items, img_h: int, top_ratio: float = 0.55, max_lines: int = 50):
    """Score OCR lines and return the most promising ones, one per line.

    Each item is expected to carry "text", "bbox" (x1, y1, x2, y2) and an
    optional "score" (detector confidence). Lines near the top of the image,
    larger lines, and lines that look like main packaging text score higher;
    boilerplate lines are penalized.
    """
    if not items or not img_h:
        return ""
    scored = []
    for it in items:
        t = _norm(it.get("text", ""))
        bb = it.get("bbox")
        conf = it.get("score", None)
        if not t or not bb:
            continue
        if not re.search(r"[A-Za-zÀ-ỹ]", t):
            continue
        x1, y1, x2, y2 = bb
        cy = (y1 + y2) / 2.0
        h = max(1.0, (y2 - y1))
        w = max(1.0, (x2 - x1))
        area = h * w
        words = t.split()
        # Skip short all-caps single tokens (often logos or codes).
        if len(words) == 1 and t.isupper() and len(t) <= 10:
            continue
        top_bonus = 1.8 if cy <= img_h * top_ratio else 1.0
        area_bonus = 0.9 + min(3.2, (area / (img_h * img_h)) * 45.0)
        conf_bonus = 1.0
        if isinstance(conf, (int, float)):
            conf_bonus = 0.85 + min(1.15, float(conf))
        intent_bonus = 1.25 if _looks_like_packaging_mainline(t) else 1.0
        penalty = 0.75 if _bad_text(t) else 1.0
        score = top_bonus * area_bonus * conf_bonus * intent_bonus * penalty
        scored.append((score, t))
    scored.sort(reverse=True)
    chosen = []
    seen = set()
    for _, t in scored:
        k = t.lower()
        if k in seen:
            continue
        seen.add(k)
        chosen.append(t)
        if len(chosen) >= max_lines:
            break
    return "\n".join(chosen)
def _simple_keywords(focus_text: str, top_n: int = 5):
    """Fallback extractor: take the top focus lines verbatim, skipping boilerplate."""
    lines = [l.strip() for l in (focus_text or "").splitlines() if l.strip()]
    out = []
    seen = set()
    for l in lines:
        t = _norm(l)
        if _bad_text(t):
            continue
        k = t.lower()
        if k in seen:
            continue
        seen.add(k)
        if len(t) < 3:
            continue
        out.append(t)
        if len(out) >= top_n:
            break
    return out
def extract_keywords_from_focus(focus_text: str, top_n: int = 5):
    """Extract up to top_n keyphrases from the focus text.

    Uses KeyBERT with MMR for diversity when available; otherwise falls back
    to the line-based heuristic in _simple_keywords.
    """
    focus_text = (focus_text or "").strip()
    if not focus_text:
        return []
    kw = get_kw()
    if kw is None:
        return _simple_keywords(focus_text, top_n=top_n)
    try:
        # Request more candidates than needed, then filter and deduplicate.
        pairs = kw.extract_keywords(
            focus_text,
            keyphrase_ngram_range=(1, 4),
            stop_words=None,
            use_mmr=True,
            diversity=0.75,
            top_n=max(20, top_n * 7),
        )
        out = []
        seen = set()
        for phrase, _ in pairs:
            p = _norm(phrase)
            if len(p) < 3:
                continue
            if _bad_text(p):
                continue
            k = p.lower()
            if k in seen:
                continue
            seen.add(k)
            out.append(p)
            if len(out) == top_n:
                break
        if out:
            return out
        return _simple_keywords(focus_text, top_n=top_n)
    except Exception:
        return _simple_keywords(focus_text, top_n=top_n)
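

if __name__ == "__main__":
    # Minimal usage sketch with hand-made items; in practice these would come
    # from an OCR pass (text, bounding box, confidence per detected line).
    # The texts, bbox values, and scores below are illustrative only.
    demo_items = [
        {"text": "Fresh Mint Whitening Toothpaste", "bbox": (40, 60, 460, 120), "score": 0.96},
        {"text": "100 g", "bbox": (60, 140, 160, 180), "score": 0.91},
        {"text": "Ingredients: aqua, sorbitol, hydrated silica", "bbox": (30, 600, 470, 640), "score": 0.88},
    ]
    focus = build_focus_text(demo_items, img_h=700)
    print(focus)
    print(extract_keywords_from_focus(focus, top_n=3))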