kin525 committed on
Commit
6171ba1
·
verified ·
1 Parent(s): 01edb13

Update core/file_processor.py

Browse files
Files changed (1) hide show
  1. core/file_processor.py +179 -219
core/file_processor.py CHANGED
@@ -1,208 +1,249 @@
1
  # core/file_processor.py
2
  import os
3
  import tempfile
4
- from typing import Dict, Any
 
5
  import pandas as pd
6
  from io import BytesIO
7
  import base64
8
  import logging
9
 
 
10
  try:
11
  import fitz # PyMuPDF
12
  PDF_SUPPORT = True
13
- except ImportError:
 
 
14
  PDF_SUPPORT = False
15
- print("警告: PyMuPDF 不可用,PDF 處理將被停用")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  try:
18
  from docx import Document
19
  DOCX_SUPPORT = True
20
- except ImportError:
21
  DOCX_SUPPORT = False
22
- print("警告: python-docx 不可用,Word 處理將被停用")
23
 
24
  try:
25
  import pytesseract
26
  from PIL import Image
27
  OCR_SUPPORT = True
28
- except ImportError:
29
  OCR_SUPPORT = False
30
- print("警告: OCR 功能不可用")
31
 
32
  class FileProcessor:
33
  def __init__(self):
34
  self.logger = logging.getLogger(__name__)
35
- self.supported_types = []
36
 
37
- # 根據可用的庫動態支持文件類型
38
  if PDF_SUPPORT:
39
  self.supported_types.extend(['pdf'])
40
- self.logger.info("PDF 處理支持已啟用")
41
-
42
  if DOCX_SUPPORT:
43
  self.supported_types.extend(['docx', 'doc'])
44
  self.logger.info("Word 處理支持已啟用")
45
-
46
  if OCR_SUPPORT:
47
  self.supported_types.extend(['jpg', 'jpeg', 'png'])
48
  self.logger.info("OCR 處理支持已啟用")
49
 
50
- # 總是支持的類型
51
  self.supported_types.extend(['txt', 'md', 'xlsx', 'xls'])
52
  self.logger.info(f"文件處理器初始化完成,支持的類型: {self.supported_types}")
53
 
54
  def get_supported_types(self) -> list:
55
- """獲取支持的文件類型列表"""
56
  return self.supported_types
57
 
58
- def process_file(self, file_data: bytes, filename: str, file_type: str) -> Dict[str, Any]:
59
- """統一文件處理入口"""
60
- if file_type not in self.supported_types:
61
- error_msg = f"不支持的文件類型: {file_type}"
62
- self.logger.warning(error_msg)
63
- return {
64
- "text_content": error_msg,
65
- "file_type": file_type,
66
- "processed": False,
67
- "error": error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  }
69
 
70
- processors = {
71
- 'pdf': self._process_pdf,
72
- 'xlsx': self._process_excel,
73
- 'xls': self._process_excel,
74
- 'docx': self._process_word,
75
- 'doc': self._process_word,
76
- 'jpg': self._process_image,
77
- 'jpeg': self._process_image,
78
- 'png': self._process_image,
79
- 'txt': self._process_text,
80
- 'md': self._process_text
81
- }
82
 
83
- try:
84
- self.logger.info(f"開始處理文件: {filename}, 類型: {file_type}")
85
- result = processors[file_type](file_data, filename)
86
- self.logger.info(f"文件處理完成: {filename}, 狀態: {result.get('processed', False)}")
87
- return result
88
  except Exception as e:
89
- error_msg = f"處理文件時出錯: {str(e)}"
90
- self.logger.error(f"處理文件失敗 {filename}: {error_msg}")
91
- return {
92
- "text_content": error_msg,
93
- "file_type": file_type,
94
- "processed": False,
95
- "error": str(e)
96
- }
97
 
98
  def _process_pdf(self, file_data: bytes, filename: str) -> Dict[str, Any]:
99
- """處理 PDF 文件"""
100
  if not PDF_SUPPORT:
101
- return {
102
- "text_content": "PDF 處理不可用,請安裝 PyMuPDF",
103
- "file_type": "pdf",
104
- "processed": False
105
- }
106
-
107
- try:
108
- # 使用內存流處理 PDF,避免臨時文件
109
- doc = fitz.open(stream=file_data, filetype="pdf")
110
- text_content = ""
111
- page_count = len(doc)
112
-
113
- for page_num in range(page_count):
114
- page = doc[page_num]
115
- text_content += page.get_text()
116
-
117
- doc.close()
118
-
119
- result = {
120
- "text_content": text_content,
121
- "page_count": page_count,
122
- "file_type": "pdf",
123
- "processed": True,
124
- "content_length": len(text_content)
125
- }
126
-
127
- self.logger.info(f"PDF 處理成功: {filename}, 頁數: {page_count}, 內容長度: {len(text_content)}")
128
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- except Exception as e:
131
- self.logger.error(f"PDF 處理失敗 {filename}: {str(e)}")
132
- return {
133
- "text_content": f"PDF 處理失敗: {str(e)}",
134
- "file_type": "pdf",
135
- "processed": False,
136
- "error": str(e)
137
- }
138
 
139
  def _process_excel(self, file_data: bytes, filename: str) -> Dict[str, Any]:
140
- """處理 Excel 文件"""
141
  try:
142
- # 使用 BytesIO 避免臨時文件
143
  excel_file = BytesIO(file_data)
144
-
145
- # 讀取 Excel 文件
146
  if filename.endswith('xlsx'):
147
  df = pd.read_excel(excel_file, engine='openpyxl')
148
  else:
149
- df = pd.read_excel(excel_file) # .xls
150
-
151
- text_content = f"Excel 文件: {filename}\n"
152
- text_content += f"行數: {len(df)}, 列數: {len(df.columns)}\n\n"
153
  text_content += "列名: " + ", ".join(df.columns.astype(str)) + "\n\n"
154
-
155
- # 添加數據預覽(前5行)
156
- text_content += "數據預覽(前5行):\n"
157
- text_content += df.head().to_string()
158
-
159
- # 添加基本統計信息
160
  numeric_columns = df.select_dtypes(include=['number']).columns
161
  if not numeric_columns.empty:
162
  text_content += f"\n\n數值列統計:\n{df[numeric_columns].describe().to_string()}"
163
-
164
- result = {
165
- "text_content": text_content,
166
- "sheet_count": 1, # 簡化處理,只讀第一個 sheet
167
- "row_count": len(df),
168
- "column_count": len(df.columns),
169
- "file_type": "excel",
170
- "processed": True,
171
- "content_length": len(text_content)
172
- }
173
-
174
  self.logger.info(f"Excel 處理成功: {filename}, 行: {len(df)}, 列: {len(df.columns)}")
175
  return result
176
-
177
  except Exception as e:
178
  self.logger.error(f"Excel 處理失敗 {filename}: {str(e)}")
179
- return {
180
- "text_content": f"Excel 處理失敗: {str(e)}",
181
- "file_type": "excel",
182
- "processed": False,
183
- "error": str(e)
184
- }
185
 
186
  def _process_word(self, file_data: bytes, filename: str) -> Dict[str, Any]:
187
- """處理 Word 文件"""
188
- if not DOCX_SUPPORT:
189
- return {
190
- "text_content": "Word 處理不可用,請安裝 python-docx",
191
- "file_type": "word",
192
- "processed": False
193
- }
194
-
195
  try:
 
 
196
  doc = Document(BytesIO(file_data))
197
  text_content = ""
198
  paragraphs = []
199
-
200
  for paragraph in doc.paragraphs:
201
  if paragraph.text.strip():
202
  text_content += paragraph.text + "\n"
203
  paragraphs.append(paragraph.text)
204
-
205
- # 處理表格
206
  tables_text = ""
207
  for table in doc.tables:
208
  for row in table.rows:
@@ -211,87 +252,40 @@ class FileProcessor:
211
  row_text.append(cell.text.strip())
212
  tables_text += " | ".join(row_text) + "\n"
213
  tables_text += "\n"
214
-
215
  if tables_text:
216
  text_content += "\n表格內容:\n" + tables_text
217
-
218
- result = {
219
- "text_content": text_content,
220
- "paragraph_count": len(paragraphs),
221
- "table_count": len(doc.tables),
222
- "file_type": "word",
223
- "processed": True,
224
- "content_length": len(text_content)
225
- }
226
-
227
  self.logger.info(f"Word 處理成功: {filename}, 段落數: {len(paragraphs)}, 表格數: {len(doc.tables)}")
228
  return result
229
-
230
  except Exception as e:
231
  self.logger.error(f"Word 處理失敗 {filename}: {str(e)}")
232
- return {
233
- "text_content": f"Word 處理失敗: {str(e)}",
234
- "file_type": "word",
235
- "processed": False,
236
- "error": str(e)
237
- }
238
 
239
  def _process_image(self, file_data: bytes, filename: str) -> Dict[str, Any]:
240
- """處理圖片文件"""
241
  if not OCR_SUPPORT:
242
- return {
243
- "text_content": "OCR 處理不可用",
244
- "file_type": "image",
245
- "processed": False
246
- }
247
-
248
  try:
249
  image = Image.open(BytesIO(file_data))
250
-
251
- # 優化圖片以提高 OCR 準確率
252
  if image.mode != 'RGB':
253
  image = image.convert('RGB')
254
-
255
- # 調整圖片大小(如果過大)
256
  max_size = (2000, 2000)
257
  image.thumbnail(max_size, Image.Resampling.LANCZOS)
258
-
259
- # OCR 識別
260
  text_content = pytesseract.image_to_string(image, lang='chi_sim+eng')
261
-
262
- result = {
263
- "text_content": text_content or "未識別到文字",
264
- "image_size": image.size,
265
- "file_type": "image",
266
- "processed": bool(text_content.strip()),
267
- "ocr_used": True,
268
- "content_length": len(text_content)
269
- }
270
-
271
  if text_content.strip():
272
  self.logger.info(f"圖片 OCR 成功: {filename}, 識別文字長度: {len(text_content)}")
273
  else:
274
  self.logger.warning(f"圖片 OCR 未識別到文字: {filename}")
275
-
276
  return result
277
-
278
  except Exception as e:
279
  self.logger.error(f"圖片處理失敗 {filename}: {str(e)}")
280
- return {
281
- "text_content": f"圖片處理失敗: {str(e)}",
282
- "file_type": "image",
283
- "processed": False,
284
- "error": str(e)
285
- }
286
 
287
  def _process_text(self, file_data: bytes, filename: str) -> Dict[str, Any]:
288
- """處理文本文件"""
289
  try:
290
- # 嘗試多種編碼
291
  encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1', 'cp1252']
292
  text_content = None
293
  used_encoding = None
294
-
295
  for encoding in encodings:
296
  try:
297
  text_content = file_data.decode(encoding)
@@ -299,67 +293,33 @@ class FileProcessor:
299
  break
300
  except UnicodeDecodeError:
301
  continue
302
-
303
  if text_content is None:
304
- # 若所有編碼都失敗,使用忽略錯誤的方式
305
  text_content = file_data.decode('utf-8', errors='ignore')
306
  used_encoding = 'utf-8 (with errors ignored)'
307
-
308
- result = {
309
- "text_content": text_content,
310
- "file_type": "text",
311
- "processed": True,
312
- "encoding": used_encoding,
313
- "content_length": len(text_content)
314
- }
315
-
316
  self.logger.info(f"文本處理成功: {filename}, 編碼: {used_encoding}, 長度: {len(text_content)}")
317
  return result
318
-
319
  except Exception as e:
320
  self.logger.error(f"文本處理失敗 {filename}: {str(e)}")
321
- return {
322
- "text_content": f"文本處理失敗: {str(e)}",
323
- "file_type": "text",
324
- "processed": False,
325
- "error": str(e)
326
- }
327
 
328
  def get_file_info(self, file_data: bytes, filename: str) -> Dict[str, Any]:
329
- """獲取文件基本信息(不進行完整處理)"""
330
  file_type = filename.split('.')[-1].lower() if '.' in filename else 'unknown'
331
  file_size = len(file_data)
332
-
333
- info = {
334
- "filename": filename,
335
- "file_type": file_type,
336
- "file_size": file_size,
337
- "file_size_human": self._format_file_size(file_size),
338
- "supported": file_type in self.supported_types
339
- }
340
-
341
  return info
342
 
343
  def _format_file_size(self, size_bytes: int) -> str:
344
- """格式化文件大小"""
345
  if size_bytes == 0:
346
  return "0 B"
347
-
348
  size_names = ["B", "KB", "MB", "GB"]
349
  i = 0
350
  while size_bytes >= 1024 and i < len(size_names) - 1:
351
  size_bytes /= 1024.0
352
  i += 1
353
-
354
  return f"{size_bytes:.2f} {size_names[i]}"
355
 
356
-
357
- # 設置日誌
358
  def setup_logging():
359
- """設置日誌配置"""
360
- logging.basicConfig(
361
- level=logging.INFO,
362
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
363
- )
364
 
365
  setup_logging()
 
1
  # core/file_processor.py
2
  import os
3
  import tempfile
4
+ import subprocess
5
+ from typing import Dict, Any, List
6
  import pandas as pd
7
  from io import BytesIO
8
  import base64
9
  import logging
10
 
11
+ # Optional libs: prefer pymupdf (fitz), fallback to pypdf or pdftotext CLI
12
  try:
13
  import fitz # PyMuPDF
14
  PDF_SUPPORT = True
15
+ PDF_BACKEND = "pymupdf"
16
+ except Exception:
17
+ fitz = None
18
  PDF_SUPPORT = False
19
+ PDF_BACKEND = None
20
+
21
+ try:
22
+ import pypdf
23
+ if not PDF_SUPPORT:
24
+ PDF_SUPPORT = True
25
+ PDF_BACKEND = PDF_BACKEND or "pypdf"
26
+ except Exception:
27
+ pypdf = None
28
+
29
+ def _pdftotext_available() -> bool:
30
+ try:
31
+ subprocess.run(["pdftotext", "-v"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3)
32
+ return True
33
+ except Exception:
34
+ return False
35
+
36
+ PDFTOTEXT_CLI = _pdftotext_available()
37
+ if PDFTOTEXT_CLI and not PDF_SUPPORT:
38
+ PDF_SUPPORT = True
39
+ PDF_BACKEND = PDF_BACKEND or "pdftotext_cli"
40
 
41
  try:
42
  from docx import Document
43
  DOCX_SUPPORT = True
44
+ except Exception:
45
  DOCX_SUPPORT = False
 
46
 
47
  try:
48
  import pytesseract
49
  from PIL import Image
50
  OCR_SUPPORT = True
51
+ except Exception:
52
  OCR_SUPPORT = False
 
53
 
54
class FileProcessor:
    """Extract text and lightweight metadata from uploaded documents.

    Every ``_process_*`` method returns a dict containing at least
    ``text_content``, ``file_type`` and ``processed``. Failures are reported
    through ``processed=False`` (plus an ``error`` entry where available)
    rather than by raising.
    """

    def __init__(self):
        """Build the supported-extension list from the optional-library flags."""
        self.logger = logging.getLogger(__name__)
        self.supported_types: List[str] = []

        if PDF_SUPPORT:
            self.supported_types.extend(['pdf'])
            self.logger.info(f"PDF 處理支持已啟用 (backend={PDF_BACKEND})")
        if DOCX_SUPPORT:
            # NOTE(review): 'doc' is routed to python-docx's Document(); confirm
            # that legacy binary .doc files are really expected to parse there.
            self.supported_types.extend(['docx', 'doc'])
            self.logger.info("Word 處理支持已啟用")
        if OCR_SUPPORT:
            self.supported_types.extend(['jpg', 'jpeg', 'png'])
            self.logger.info("OCR 處理支持已啟用")

        # These types do not depend on any of the optional-library flags above.
        self.supported_types.extend(['txt', 'md', 'xlsx', 'xls'])
        self.logger.info(f"文件處理器初始化完成,支持的類型: {self.supported_types}")

    def get_supported_types(self) -> list:
        """Return the list of file extensions this instance will process."""
        return self.supported_types

    @staticmethod
    def _read_stream(stream):
        """Read a file-like object from its start and return its content as bytes."""
        try:
            stream.seek(0)
        except Exception:
            # Deliberate best effort: non-seekable streams are simply read
            # from their current position.
            pass
        data = stream.read()
        # Text-mode streams yield str; the processors expect bytes.
        return data.encode("utf-8") if isinstance(data, str) else data

    @staticmethod
    def _decode_text_payload(data: str) -> bytes:
        """Turn a string payload into bytes, treating it as base64 when valid.

        Line breaks are dropped first so wrapped base64 is still recognised.
        Validation is strict, so ordinary text containing spaces or punctuation
        is not silently decoded into garbage; it is UTF-8 encoded instead.
        """
        compact = data.replace("\r", "").replace("\n", "")
        try:
            return base64.b64decode(compact, validate=True)
        except ValueError:  # binascii.Error is a ValueError subclass
            return data.encode()

    def _read_input(self, file_input, filename) -> tuple:
        """Normalise *file_input* into ``(file_bytes, filename)``.

        ``file_bytes`` is None when the input kind is not recognised, when a
        string is not an existing path, or when a dict carries no usable data.
        """
        if isinstance(file_input, (bytes, bytearray)):
            return bytes(file_input), filename

        if isinstance(file_input, str):
            if not os.path.exists(file_input):
                return None, filename
            with open(file_input, "rb") as f:
                return f.read(), filename or os.path.basename(file_input)

        if hasattr(file_input, "read"):
            data = self._read_stream(file_input)
            if not filename:
                candidate = getattr(file_input, "name", None) or getattr(file_input, "filename", None)
                # Some file objects expose an integer descriptor as ``name``.
                filename = candidate if isinstance(candidate, str) else None
            return data, filename

        if isinstance(file_input, dict):
            filename = filename or file_input.get("name") or file_input.get("filename")
            # Take the first key that is present, so an empty payload (b"")
            # is kept instead of being skipped as falsy.
            data = next(
                (file_input[key] for key in ("data", "content", "bytes") if file_input.get(key) is not None),
                None,
            )
            if isinstance(data, str):
                return self._decode_text_payload(data), filename
            if isinstance(data, (bytes, bytearray)):
                return bytes(data), filename
            fobj = file_input.get("file")
            if fobj is not None and hasattr(fobj, "read"):
                return self._read_stream(fobj), filename
            return None, filename

        return None, filename

    def process_file(self, file_input, filename: str = None, file_type: str = None) -> Dict[str, Any]:
        """Process a file and return its extracted text and metadata.

        Args:
            file_input: Raw bytes/bytearray, a path to an existing file, a
                file-like object with ``read()``, or a dict with ``name`` /
                ``filename`` and ``data`` / ``content`` / ``bytes`` / ``file``.
            filename: Optional name; inferred from the input when omitted and
                defaulting to ``"uploaded_file"``.
            file_type: Optional extension (case-insensitive, a leading dot is
                tolerated); inferred from ``filename`` when omitted.

        Returns:
            The result dict of the matching ``_process_*`` method, or an error
            dict with ``processed=False``. This method does not raise.
        """
        try:
            file_bytes, filename = self._read_input(file_input, filename)
            if file_bytes is None:
                # Report the real problem instead of handing None to a processor.
                return {"text_content": "無法識別的 file_input 類型", "file_type": file_type or "unknown", "processed": False, "error": "unsupported_input"}

            if not filename:
                filename = "uploaded_file"
            if not file_type:
                file_type = os.path.splitext(filename)[1].lower().lstrip(".") or "unknown"
            file_type = file_type.lower().lstrip(".")

            if file_type not in self.supported_types:
                error_msg = f"不支持的文件類型: {file_type}"
                self.logger.warning(error_msg)
                return {"text_content": error_msg, "file_type": file_type, "processed": False, "error": error_msg}

            processors = {
                'pdf': self._process_pdf,
                'xlsx': self._process_excel,
                'xls': self._process_excel,
                'docx': self._process_word,
                'doc': self._process_word,
                'jpg': self._process_image,
                'jpeg': self._process_image,
                'png': self._process_image,
                'txt': self._process_text,
                'md': self._process_text,
            }

            try:
                self.logger.info(f"開始處理文件: {filename}, 類型: {file_type}")
                result = processors[file_type](file_bytes, filename)
                self.logger.info(f"文件處理完成: {filename}, 狀態: {result.get('processed', False)}")
                return result
            except Exception as e:
                error_msg = f"處理文件時出錯: {str(e)}"
                self.logger.error(f"處理文件失敗 {filename}: {error_msg}")
                return {"text_content": error_msg, "file_type": file_type, "processed": False, "error": str(e)}

        except Exception as e:
            # Boundary handler: callers rely on getting a dict back, never an exception.
            self.logger.exception("process_file top-level exception")
            return {"text_content": str(e), "file_type": file_type or "unknown", "processed": False, "error": str(e)}

    def _process_pdf(self, file_data: bytes, filename: str) -> Dict[str, Any]:
        """Extract text from a PDF, trying PyMuPDF, then pypdf, then pdftotext.

        Each backend failure is logged as a warning and the next backend is
        tried. ``page_count`` is None when the pdftotext CLI produced the text.
        """
        if not PDF_SUPPORT:
            return {"text_content": "PDF 處理不可用,請安裝 PyMuPDF 或 pypdf 或 pdftotext", "file_type": "pdf", "processed": False}

        if PDF_BACKEND == "pymupdf" and fitz is not None:
            try:
                doc = fitz.open(stream=file_data, filetype="pdf")
                try:
                    page_count = len(doc)
                    text_content = "".join(doc[page_num].get_text() for page_num in range(page_count))
                finally:
                    # Close the document even when a page fails to extract.
                    doc.close()
                return {"text_content": text_content, "page_count": page_count, "file_type": "pdf", "processed": True, "content_length": len(text_content)}
            except Exception as e:
                self.logger.warning(f"PyMuPDF extraction failed: {e}")

        if pypdf is not None:
            try:
                reader = pypdf.PdfReader(BytesIO(file_data))
                page_texts = []
                for page in reader.pages:
                    try:
                        page_texts.append(page.extract_text() or "")
                    except Exception:
                        # Best effort: one unreadable page must not discard the rest.
                        page_texts.append("")
                full = "\n".join(page_texts)
                return {"text_content": full, "page_count": len(reader.pages), "file_type": "pdf", "processed": True, "content_length": len(full)}
            except Exception as e:
                self.logger.warning(f"pypdf extraction failed: {e}")

        if PDFTOTEXT_CLI:
            tmp_path = None
            out_txt = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                    tmp_path = tmp.name
                    tmp.write(file_data)
                out_txt = tmp_path + ".txt"
                subprocess.run(["pdftotext", "-layout", tmp_path, out_txt], check=True, timeout=30)
                text_content = ""
                if os.path.exists(out_txt):
                    with open(out_txt, "r", encoding="utf-8", errors="ignore") as f:
                        text_content = f.read()
                return {"text_content": text_content, "page_count": None, "file_type": "pdf", "processed": True, "content_length": len(text_content)}
            except Exception as e:
                self.logger.warning(f"pdftotext CLI extraction failed: {e}")
            finally:
                # Remove both temporary files on success and on failure alike.
                for path in (tmp_path, out_txt):
                    if path:
                        try:
                            os.remove(path)
                        except OSError:
                            pass

        return {"text_content": "PDF 處理失敗: 無可用的解析後備方法", "file_type": "pdf", "processed": False, "error": "no_pdf_backend"}

    def _process_excel(self, file_data: bytes, filename: str) -> Dict[str, Any]:
        """Summarise the first sheet of an Excel workbook as text.

        The summary holds the row/column counts, the column names, the first
        five rows and, when numeric columns exist, their ``describe()`` table.
        """
        try:
            excel_file = BytesIO(file_data)
            # Case-insensitive so that e.g. "REPORT.XLSX" also selects openpyxl.
            if filename.lower().endswith('xlsx'):
                df = pd.read_excel(excel_file, engine='openpyxl')
            else:
                df = pd.read_excel(excel_file)

            parts = [
                f"Excel 文件: {filename}\n行數: {len(df)}, 列數: {len(df.columns)}\n\n",
                "列名: " + ", ".join(df.columns.astype(str)) + "\n\n",
                "數據預覽(前5行):\n" + df.head().to_string(),
            ]
            numeric_columns = df.select_dtypes(include=['number']).columns
            if not numeric_columns.empty:
                parts.append(f"\n\n數值列統計:\n{df[numeric_columns].describe().to_string()}")
            text_content = "".join(parts)

            # sheet_count is fixed at 1 because only one sheet is read above.
            result = {"text_content": text_content, "sheet_count": 1, "row_count": len(df), "column_count": len(df.columns), "file_type": "excel", "processed": True, "content_length": len(text_content)}
            self.logger.info(f"Excel 處理成功: {filename}, 行: {len(df)}, 列: {len(df.columns)}")
            return result
        except Exception as e:
            self.logger.error(f"Excel 處理失敗 {filename}: {str(e)}")
            return {"text_content": f"Excel 處理失敗: {str(e)}", "file_type": "excel", "processed": False, "error": str(e)}

    def _process_word(self, file_data: bytes, filename: str) -> Dict[str, Any]:
        """Extract non-blank paragraphs and table cells from a Word document."""
        if not DOCX_SUPPORT:
            return {"text_content": "Word 處理不可用,請安裝 python-docx", "file_type": "word", "processed": False}
        try:
            doc = Document(BytesIO(file_data))
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            text_content = "".join(text + "\n" for text in paragraphs)

            # One " | "-joined line per row, with a blank line after each table.
            tables_text = ""
            for table in doc.tables:
                for row in table.rows:
                    tables_text += " | ".join(cell.text.strip() for cell in row.cells) + "\n"
                tables_text += "\n"
            if tables_text:
                text_content += "\n表格內容:\n" + tables_text

            result = {"text_content": text_content, "paragraph_count": len(paragraphs), "table_count": len(doc.tables), "file_type": "word", "processed": True, "content_length": len(text_content)}
            self.logger.info(f"Word 處理成功: {filename}, 段落數: {len(paragraphs)}, 表格數: {len(doc.tables)}")
            return result
        except Exception as e:
            self.logger.error(f"Word 處理失敗 {filename}: {str(e)}")
            return {"text_content": f"Word 處理失敗: {str(e)}", "file_type": "word", "processed": False, "error": str(e)}

    def _process_image(self, file_data: bytes, filename: str) -> Dict[str, Any]:
        """Run OCR on an image and return the recognised text.

        ``processed`` is False when OCR ran but found only whitespace.
        """
        if not OCR_SUPPORT:
            return {"text_content": "OCR 處理不可用", "file_type": "image", "processed": False}
        try:
            image = Image.open(BytesIO(file_data))
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Cap the size in place; fall back to Image.LANCZOS on Pillow
            # versions that do not have the Image.Resampling enum.
            max_size = (2000, 2000)
            resample = getattr(Image, "Resampling", Image).LANCZOS
            image.thumbnail(max_size, resample)

            text_content = pytesseract.image_to_string(image, lang='chi_sim+eng')
            has_text = bool(text_content.strip())
            result = {"text_content": text_content or "未識別到文字", "image_size": image.size, "file_type": "image", "processed": has_text, "ocr_used": True, "content_length": len(text_content)}
            if has_text:
                self.logger.info(f"圖片 OCR 成功: {filename}, 識別文字長度: {len(text_content)}")
            else:
                self.logger.warning(f"圖片 OCR 未識別到文字: {filename}")
            return result
        except Exception as e:
            self.logger.error(f"圖片處理失敗 {filename}: {str(e)}")
            return {"text_content": f"圖片處理失敗: {str(e)}", "file_type": "image", "processed": False, "error": str(e)}

    def _process_text(self, file_data: bytes, filename: str) -> Dict[str, Any]:
        """Decode a text file by trying several encodings in order.

        ``latin-1`` is tried last because it accepts every byte sequence;
        placing it earlier would make the encodings after it unreachable.
        """
        try:
            encodings = ['utf-8', 'gbk', 'gb2312', 'cp1252', 'latin-1']
            text_content = None
            used_encoding = None
            for encoding in encodings:
                try:
                    text_content = file_data.decode(encoding)
                    used_encoding = encoding
                    break
                except UnicodeDecodeError:
                    continue
            if text_content is None:
                # Defensive only: latin-1 above cannot fail on bytes input.
                text_content = file_data.decode('utf-8', errors='ignore')
                used_encoding = 'utf-8 (with errors ignored)'

            result = {"text_content": text_content, "file_type": "text", "processed": True, "encoding": used_encoding, "content_length": len(text_content)}
            self.logger.info(f"文本處理成功: {filename}, 編碼: {used_encoding}, 長度: {len(text_content)}")
            return result
        except Exception as e:
            self.logger.error(f"文本處理失敗 {filename}: {str(e)}")
            return {"text_content": f"文本處理失敗: {str(e)}", "file_type": "text", "processed": False, "error": str(e)}

    def get_file_info(self, file_data: bytes, filename: str) -> Dict[str, Any]:
        """Return name, extension, size and support status without parsing the file."""
        file_type = filename.split('.')[-1].lower() if '.' in filename else 'unknown'
        file_size = len(file_data)
        return {"filename": filename, "file_type": file_type, "file_size": file_size, "file_size_human": self._format_file_size(file_size), "supported": file_type in self.supported_types}

    def _format_file_size(self, size_bytes: int) -> str:
        """Format a byte count with two decimals in B, KB, MB or GB (capped at GB)."""
        if size_bytes == 0:
            return "0 B"
        size_names = ["B", "KB", "MB", "GB"]
        value = size_bytes
        unit = 0
        while value >= 1024 and unit < len(size_names) - 1:
            value /= 1024.0
            unit += 1
        return f"{value:.2f} {size_names[unit]}"
321
 
 
 
322
def setup_logging():
    """Configure root logging at INFO level with a timestamped record format."""
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
 
 
 
 
324
 
325
  setup_logging()