# Hugging Face Space: mwan2211/PaddleOCR_keyBERT (status: Running)
# File size: 2,529 Bytes
# Commit hashes listed alongside the source on the page: a2e4cc4, fb8d729

import gradio as gr
import numpy as np
import urllib.parse
import traceback
from ocr_engine import ocr_items, ocr_text
from keyword_engine import build_focus_text, extract_keywords_from_focus

def build_link(keywords, full_text):
    """Build the Google-search HTML card and the plain-text keyword label.

    Args:
        keywords: Sequence of keyword strings. When empty or ``None`` the
            search query falls back to the first 120 characters of
            ``full_text``.
        full_text: Text used for the fallback query; may be ``None`` or
            empty, in which case the query is an empty string.

    Returns:
        A ``(html, label)`` tuple. ``html`` is the markup of the result card
        (heading plus a link that opens the Google search in a new tab) and
        ``label`` is the comma-separated, unescaped text of what is shown in
        the heading.
    """
    # Function-scope import so the block does not depend on a file-level
    # import being added elsewhere.
    from html import escape

    if keywords:
        query = " ".join(keywords)
        shown = keywords
    else:
        query = (full_text or "")[:120]
        shown = ["(dùng văn bản gốc)"]

    label = ", ".join(shown)
    url = "https://www.google.com/search?q=" + urllib.parse.quote(query)

    # The keywords originate from OCR of a user-supplied image, so they must
    # be HTML-escaped before being embedded in markup; otherwise text such as
    # "<script>" found in the image would be injected into the page.
    # The URL is already percent-encoded; escaping it again is a defensive
    # no-op that keeps the attribute value well-formed.
    safe_label = escape(label)
    safe_url = escape(url, quote=True)

    card = f"""
    <div style="text-align:center;padding:20px;border-radius:10px;
                background:#f0f9ff;border:1px solid #cce3de">
        <h3 style="color:#d90429">{safe_label}</h3>
        <a href="{safe_url}" target="_blank"
           style="background:#4285F4;color:white;padding:10px 20px;
                  border-radius:5px;font-weight:bold;text-decoration:none">
           Tra cứu Google
        </a>
    </div>
    """
    # The label is returned unescaped: it is a plain-text value, not markup.
    return card, label

def process(image):
    """Run OCR and keyword extraction on an image for the Gradio callback.

    Args:
        image: The uploaded image; expected to be a ``numpy.ndarray``.
            ``None`` means no image was supplied.

    Returns:
        A 3-tuple ``(text, keywords, html)``. On success ``text`` is the OCR
        output, ``keywords`` the comma-separated keyword label and ``html``
        the search-link card. On any failure ``text`` carries a message and
        the other two elements are empty strings. Exceptions raised while
        processing are caught and reported in ``text`` together with the
        traceback.
    """
    if image is None:
        return "Chưa có ảnh đầu vào.", "", ""

    try:
        if not isinstance(image, np.ndarray):
            return "Định dạng ảnh không hợp lệ.", "", ""

        recognized = ocr_text(image)
        if not recognized:
            return "Không trích xuất được nội dung.", "", ""

        # Build the focus text from the OCR items; img_h is the array's
        # first dimension. Fall back to the full OCR text when the focus
        # text comes back empty.
        focus = build_focus_text(
            ocr_items(image),
            img_h=image.shape[0],
            top_ratio=0.35,
            max_lines=30,
        ) or recognized

        link_html, label = build_link(
            extract_keywords_from_focus(focus, top_n=5),
            recognized,
        )
        return recognized, label, link_html

    except Exception as err:
        return f"Lỗi hệ thống:\n{err}\n{traceback.format_exc()}", "", ""

# Declare the Gradio UI. Components are created inside nested context
# managers, so the statement order below is the on-screen order.
with gr.Blocks(title="OCR & Tra cứu thông minh") as app:
    gr.Markdown("# OCR – PaddleOCR + KeyBERT")

    with gr.Row():
        # First column: image upload and the trigger button.
        with gr.Column(scale=1):
            # type="numpy" so the callback receives a numpy array, which is
            # what `process` checks for.
            input_image = gr.Image(label="Tải ảnh", type="numpy")
            btn_submit = gr.Button("Bắt đầu trích xuất", variant="primary")

        # Second column: search-link card, OCR text and keyword list.
        with gr.Column(scale=1):
            output_link = gr.HTML(label="Link tra cứu")
            output_text = gr.Textbox(label="Văn bản OCR", lines=10)
            output_keywords = gr.Textbox(label="Từ khóa", lines=5)

    # `outputs` follows the order of the tuple returned by `process`
    # (text, keywords, html), not the visual order of the components above.
    btn_submit.click(
        fn=process,
        inputs=input_image,
        outputs=[output_text, output_keywords, output_link],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()