mwan2211 committed on
Commit
fb8d729
·
verified ·
1 Parent(s): 8bcdb9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -5
app.py CHANGED
@@ -2,15 +2,15 @@ import gradio as gr
2
  import numpy as np
3
  import urllib.parse
4
  import traceback
5
- from ocr_engine import ocr_image
6
- from keyword_engine import extract_keywords
7
 
8
  def build_link(keywords, full_text):
9
  if keywords:
10
  query = " ".join(keywords)
11
  shown = keywords
12
  else:
13
- query = (full_text or "")[:100]
14
  shown = ["(dùng văn bản gốc)"]
15
 
16
  url = "https://www.google.com/search?q=" + urllib.parse.quote(query)
@@ -35,11 +35,16 @@ def process(image):
35
  if not isinstance(image, np.ndarray):
36
  return "Định dạng ảnh không hợp lệ.", "", ""
37
 
38
- full_text = ocr_image(image)
39
  if not full_text:
40
  return "Không trích xuất được nội dung.", "", ""
41
 
42
- keywords = extract_keywords(full_text, top_n=5)
 
 
 
 
 
43
  html_link, keywords_str = build_link(keywords, full_text)
44
  return full_text, keywords_str, html_link
45
 
 
2
  import numpy as np
3
  import urllib.parse
4
  import traceback
5
+ from ocr_engine import ocr_items, ocr_text
6
+ from keyword_engine import build_focus_text, extract_keywords_from_focus
7
 
8
  def build_link(keywords, full_text):
9
  if keywords:
10
  query = " ".join(keywords)
11
  shown = keywords
12
  else:
13
+ query = (full_text or "")[:120]
14
  shown = ["(dùng văn bản gốc)"]
15
 
16
  url = "https://www.google.com/search?q=" + urllib.parse.quote(query)
 
35
  if not isinstance(image, np.ndarray):
36
  return "Định dạng ảnh không hợp lệ.", "", ""
37
 
38
+ full_text = ocr_text(image)
39
  if not full_text:
40
  return "Không trích xuất được nội dung.", "", ""
41
 
42
+ items = ocr_items(image)
43
+ focus_text = build_focus_text(items, img_h=image.shape[0], top_ratio=0.35, max_lines=30)
44
+ if not focus_text:
45
+ focus_text = full_text
46
+
47
+ keywords = extract_keywords_from_focus(focus_text, top_n=5)
48
  html_link, keywords_str = build_link(keywords, full_text)
49
  return full_text, keywords_str, html_link
50