瀏覽代碼

08/07/2024 14:16:24

sprivacy 1 年之前
父節點
當前提交
2cee030d8d
共有 2 個文件被更改,包括 98 次插入和 36 次刪除
  1. 0 12
      document_.py
  2. 98 24
      get_info.py

+ 0 - 12
document_.py

@@ -203,8 +203,6 @@ class DocumentPreReview:
                     announcements += text["text"]
                 break
         return announcements
-    
-
 
     def formal_criteria(self, review_criteria_list:list):
         ''' Analysis of formal review criteria
@@ -287,7 +285,6 @@ class DocumentPreReview:
                 需要在投标文件中比对三个位置的报价总和值抽取
                 '''
                 pass
-    
 
     def qualification_criteria(self, review_criteria_list:list, bidder_know:dict):
         ''' Qualification assessment criteria
@@ -330,14 +327,6 @@ class DocumentPreReview:
                         需要设计prompt,可将内容及情况在线上glm4中使用,测出合适prompt
                         '''
 
-                        
-                        
-                
-                    
-
-
-
-
     def content_parsing(self):
         ''' data analysis aggregate function
         '''
@@ -347,7 +336,6 @@ class DocumentPreReview:
 
         # self.qualification_criteria(tag_dict['资格评审标准'], bidder_know)
 
-                    
 
 
 if __name__ == '__main__':

+ 98 - 24
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-08-06 17:16:15
+# @Last Modified time: 2024-08-07 13:44:47
 
 # import os
 
@@ -406,7 +406,6 @@ class PdfExtractAttr(object):
                         title_index += 1
                         self.outlines.at[idx, 'text'] += '\n'
                         self.outlines.at[idx, 'text'] += text
-                        print(self.outlines.iloc[idx]['text'])
 
                     # 正文部分
                     elif not current or self.can_merge_lines(current, element):# 可以合并
@@ -430,15 +429,78 @@ class PdfExtractAttr(object):
                             image_index += 1
 
 
-        with open(title_path, 'w', encoding='utf-8') as fp:
-            json.dump(texts, fp, indent=4, ensure_ascii=False)
+        if title_path:
+            with open(title_path, 'w', encoding='utf-8') as fp:
+                json.dump(texts, fp, indent=4, ensure_ascii=False)
 
-        self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
+        if section_path:
+            self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
 
-    def parse_outline(self) -> list:
-        """PDF大纲解析
+    def extract_toc(self) -> list:
+        """PDF大纲解析,依据内容解析
         """
         results = []
+
+        for page_number, page in enumerate(extract_pages(self.file_path)):
+
+            is_outline = False
+
+            if page_number < 1:
+                continue
+
+            if page_number > 20:
+                break
+
+            lines = [element.get_text().strip() for element in page if isinstance(element, LTTextBoxHorizontal)]
+
+            # 用于存储目录结构
+            # directory_structure = []
+
+            for line in lines:
+                # 检查是否符合目录格式
+                if line and '.' in line and (line[0].isdigit() or '\u4e00' <= line[0] <= '\u9fff') and line[-1].isdigit():
+                    is_outline = True
+                    # 计算缩进级别
+                    indent_level = 1
+                    # 获取内容
+                    title = re.findall('^[\d\.、]{0,}[\u4e00-\u9fff、()\s]+', line).pop()
+                    # 计算页码
+                    page_n = int(re.findall('\d+$', line).pop())
+                    # 添加到目录结构中
+                    # directory_structure.append({
+                    results.append({
+                        "level": indent_level,
+                        "title": title,
+                        "page_number": page_n
+                    })
+
+            # if directory_structure:
+            #     pprint(directory_structure)
+
+            if not is_outline:
+                break
+
+        return results
+
+    def extract_content(self, content_path: str = None) -> list:
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                self.content.append({
+                    'page_number': page.page_number - 1,
+                    'text': page.extract_text()
+                })
+
+        if content_path:
+            with open(content_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.content, fp, indent=4, ensure_ascii=False)
+
+        return self.content
+
+    def parse_outline(self, outline_path: str = None) -> list:
+        """PDF大纲解析,依据元数据解析,解析失败则调用内容解析
+        """
+        results = []
+
         with open(self.file_path, "rb") as fp:
             try:
                 parser = PDFParser(fp)
@@ -464,8 +526,12 @@ class PdfExtractAttr(object):
             finally:
                 parser.close()
 
-        with open('outlines.json', 'w', encoding='utf-8') as op:
-            json.dump(results, op, indent=4, ensure_ascii=False)
+        if not results:
+            results = self.extract_toc()
+
+        if outline_path:
+            with open(outline_path, 'w', encoding='utf-8') as op:
+                json.dump(results, op, indent=4, ensure_ascii=False)
 
         self.outlines = pd.DataFrame(results)
 
@@ -540,7 +606,7 @@ class PdfExtractAttr(object):
         else:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
 
-    def parse_table_pro(self) -> None:
+    def parse_table_pro(self, table_path: str = 'all_tables.json') -> None:
         """表格解析
         """
         if self.detail_df == None:
@@ -551,14 +617,24 @@ class PdfExtractAttr(object):
                 # 查询是否存在表格
                 tables = page_layout.find_tables()
 
+                if not tables:
+                    continue
+
+                print(f"解析PDF{page_number}页的表格")
+
                 tables_pro = camelot.read_pdf(
                     self.file_path,
                     # flavor='stream',
                     pages=str(page_number+1),
                     # edge_tol=200,
                 )
+
+                if not tables_pro:
+                    continue
+
                 # 检测到该页面存在一个表格,对其进行合并判断
-                if len(tables) == 1:
+                if len(tables) == 1 and tables_pro:
+                    # print(f"解析PDF{page_number}页的表格")
                     table = tables[0]
                     table_pro = tables_pro[0].df.to_dict(orient='split')['data']
                     x0, y0, x1, y1 = table.bbox
@@ -576,24 +652,22 @@ class PdfExtractAttr(object):
                     for table_index in range(1, len(tables_pro)):
                         self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)
 
-    def output(self, table_path: str = 'all_tables.json'):
-        """结果输出
-        """
-        with open(table_path, 'w', encoding='utf-8') as fp:
-            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
+        if table_path:
+            with open(table_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.tables, fp, indent=4, ensure_ascii=False)
 
         return self.tables
 
 
 if __name__ == '__main__':
-    # pdf_path = './投标文件-修改版9-5-1-1.pdf'
-    pdf_path = './南方电网数字研究院有限公司.pdf'
+    pdf_path = './投标文件-修改版9-5-1-1.pdf'
+    # pdf_path = './南方电网数字研究院有限公司.pdf'
     # pdf_path = './2022年度工程类-公招采购资料/2022-2025年度三峡电站9台机组检修密封加工制作重新招标/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
     # title_path = './投标文件-修改版9-5-1-1.json'
     # title_path = './投标文件-修改版9-5-1-1-title.json'
-    title_path = './南方电网数字研究院有限公司.json'
+    # title_path = './南方电网数字研究院有限公司.json'
     # section_path = './投标文件-修改版9-5-1-1-section.json'
-    section_path = './南方电网数字研究院有限公司-section.json'
+    # section_path = './南方电网数字研究院有限公司-section.json'
     # image_dir = './extracted_images'
     # os.makedirs(image_dir, exist_ok=True)
 
@@ -601,7 +675,7 @@ if __name__ == '__main__':
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
 
     agent = PdfExtractAttr(file_path=pdf_path)
-    agent.parse_outline()
-    agent.main_parse(title_path=title_path, section_path=section_path)
-    # agent.parse_table_pro()
-    # agent.output('all_tables_pro.json')
+    # agent.parse_outline()
+    # agent.main_parse(title_path=title_path, section_path=section_path)
+    agent.parse_table_pro()
+    agent.output('all_tables_pro.json')