EGYADMIN committed on
Commit
8118598
·
verified ·
1 Parent(s): ab5871b

Update modules/document_processor.py

Browse files
Files changed (1) hide show
  1. modules/document_processor.py +1 -6
modules/document_processor.py CHANGED
@@ -12,7 +12,6 @@ import docx
12
  import PyPDF2
13
  import fitz # PyMuPDF
14
  import pdfplumber
15
- import mammoth
16
  from openpyxl import load_workbook
17
  from PIL import Image
18
  import pytesseract
@@ -125,10 +124,6 @@ class DocumentProcessor:
125
  try:
126
  doc = docx.Document(file_path)
127
  extracted_data["text"] = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
128
- if not extracted_data["text"].strip():
129
- with open(file_path, "rb") as docx_file:
130
- result = mammoth.extract_raw_text(docx_file)
131
- extracted_data["text"] = result.value
132
  except Exception as e:
133
  extracted_data["error"] = f"خطأ في معالجة ملف DOCX: {str(e)}"
134
- return extracted_data
 
12
  import PyPDF2
13
  import fitz # PyMuPDF
14
  import pdfplumber
 
15
  from openpyxl import load_workbook
16
  from PIL import Image
17
  import pytesseract
 
124
  try:
125
  doc = docx.Document(file_path)
126
  extracted_data["text"] = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
 
 
 
 
127
  except Exception as e:
128
  extracted_data["error"] = f"خطأ في معالجة ملف DOCX: {str(e)}"
129
+ return extracted_data