Spaces:
Sleeping
Sleeping
Update modules/document_processor.py
Browse files
modules/document_processor.py
CHANGED
@@ -12,7 +12,6 @@ import docx
|
|
12 |
import PyPDF2
|
13 |
import fitz # PyMuPDF
|
14 |
import pdfplumber
|
15 |
-
import mammoth
|
16 |
from openpyxl import load_workbook
|
17 |
from PIL import Image
|
18 |
import pytesseract
|
@@ -125,10 +124,6 @@ class DocumentProcessor:
|
|
125 |
try:
|
126 |
doc = docx.Document(file_path)
|
127 |
extracted_data["text"] = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
|
128 |
-
if not extracted_data["text"].strip():
|
129 |
-
with open(file_path, "rb") as docx_file:
|
130 |
-
result = mammoth.extract_raw_text(docx_file)
|
131 |
-
extracted_data["text"] = result.value
|
132 |
except Exception as e:
|
133 |
extracted_data["error"] = f"خطأ في معالجة ملف DOCX: {str(e)}"
|
134 |
-
return extracted_data
|
|
|
12 |
import PyPDF2
|
13 |
import fitz # PyMuPDF
|
14 |
import pdfplumber
|
|
|
15 |
from openpyxl import load_workbook
|
16 |
from PIL import Image
|
17 |
import pytesseract
|
|
|
124 |
try:
|
125 |
doc = docx.Document(file_path)
|
126 |
extracted_data["text"] = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
|
|
|
|
|
|
|
|
|
127 |
except Exception as e:
|
128 |
extracted_data["error"] = f"خطأ في معالجة ملف DOCX: {str(e)}"
|
129 |
+
return extracted_data
|