Spaces:
Running
Running
File size: 2,529 Bytes
a2e4cc4 fb8d729 a2e4cc4 fb8d729 a2e4cc4 fb8d729 a2e4cc4 fb8d729 a2e4cc4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import gradio as gr
import numpy as np
import urllib.parse
import traceback
from ocr_engine import ocr_items, ocr_text
from keyword_engine import build_focus_text, extract_keywords_from_focus
def build_link(keywords, full_text):
    """Build the Google-search card shown under the OCR result.

    Args:
        keywords: Sequence of keyword strings; may be empty or ``None``.
        full_text: Full OCR text. Its first 120 characters are used as the
            search query when no keywords are available.

    Returns:
        A ``(html, label)`` tuple. ``html`` is the markup for the card with a
        link to the Google search; ``label`` is the comma-separated text that
        the card displays, returned unescaped for use in a plain text field.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from html import escape

    if keywords:
        query = " ".join(keywords)
        shown = keywords
    else:
        # No keywords: fall back to the beginning of the raw OCR text.
        query = (full_text or "")[:120]
        shown = ["(dùng văn bản gốc)"]

    label = ", ".join(shown)
    # quote() percent-encodes everything that could terminate the href
    # attribute (quotes, angle brackets, ampersands, spaces).
    url = "https://www.google.com/search?q=" + urllib.parse.quote(query)
    # The label originates from OCR output, i.e. untrusted text: escape it so
    # characters such as "<" or "&" are rendered literally instead of being
    # interpreted as markup.
    markup = f"""
<div style="text-align:center;padding:20px;border-radius:10px;
background:#f0f9ff;border:1px solid #cce3de">
<h3 style="color:#d90429">{escape(label)}</h3>
<a href="{url}" target="_blank"
style="background:#4285F4;color:white;padding:10px 20px;
border-radius:5px;font-weight:bold;text-decoration:none">
Tra cứu Google
</a>
</div>
"""
    return markup, label
def process(image):
    """Run OCR on an image and derive keywords plus a search link.

    Args:
        image: The uploaded image; anything other than a NumPy array is
            rejected with a message.

    Returns:
        A 3-tuple ``(text, keywords, html)``. On any failure the first element
        carries a user-facing message and the other two are empty strings.
    """
    no_extras = ("", "")

    if image is None:
        return ("Chưa có ảnh đầu vào.", *no_extras)

    try:
        if not isinstance(image, np.ndarray):
            return ("Định dạng ảnh không hợp lệ.", *no_extras)

        recognized = ocr_text(image)
        if not recognized:
            return ("Không trích xuất được nội dung.", *no_extras)

        # Keyword extraction works on a "focus" excerpt built from the OCR
        # items; when that excerpt is empty the whole text is used instead.
        focus = build_focus_text(
            ocr_items(image),
            img_h=image.shape[0],
            top_ratio=0.35,
            max_lines=30,
        ) or recognized

        link_markup, keyword_label = build_link(
            extract_keywords_from_focus(focus, top_n=5),
            recognized,
        )
        return (recognized, keyword_label, link_markup)
    except Exception as e:
        # Surface the error and its traceback in the text output.
        return (f"Lỗi hệ thống:\n{e}\n{traceback.format_exc()}", *no_extras)
# Gradio UI: an image input and a trigger button on one side, the three
# outputs of process() on the other.
with gr.Blocks(title="OCR & Tra cứu thông minh") as app:
    gr.Markdown("# OCR – PaddleOCR + KeyBERT")
    with gr.Row():
        with gr.Column(scale=1):
            # type="numpy" matches the np.ndarray check performed in process().
            input_image = gr.Image(label="Tải ảnh", type="numpy")
            btn_submit = gr.Button("Bắt đầu trích xuất", variant="primary")
        with gr.Column(scale=1):
            output_link = gr.HTML(label="Link tra cứu")
            output_text = gr.Textbox(label="Văn bản OCR", lines=10)
            output_keywords = gr.Textbox(label="Từ khóa", lines=5)
    # The order of `outputs` follows the tuple returned by process():
    # (full text, keywords, HTML link) -- not the on-screen order above.
    btn_submit.click(
        fn=process,
        inputs=input_image,
        outputs=[output_text, output_keywords, output_link],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()
|