@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date: 2024-06-11 13:43:14
 # @Last Modified by: privacy
-# @Last Modified time: 2024-08-06 17:16:15
+# @Last Modified time: 2024-08-07 13:44:47

 # import os
@@ -406,7 +406,6 @@ class PdfExtractAttr(object):
                     title_index += 1
                     self.outlines.at[idx, 'text'] += '\n'
                     self.outlines.at[idx, 'text'] += text
-                    print(self.outlines.iloc[idx]['text'])

             # Body text
             elif not current or self.can_merge_lines(current, element):  # lines can be merged
@@ -430,15 +429,78 @@ class PdfExtractAttr(object):
                 image_index += 1

-        with open(title_path, 'w', encoding='utf-8') as fp:
-            json.dump(texts, fp, indent=4, ensure_ascii=False)
+        if title_path:
+            with open(title_path, 'w', encoding='utf-8') as fp:
+                json.dump(texts, fp, indent=4, ensure_ascii=False)

-        self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
+        if section_path:
+            self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
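Both dumps are now optional. A minimal call sketch (assuming title_path and section_path default to None, as the guards imply; file names hypothetical):

    agent = PdfExtractAttr(file_path='./bid.pdf')
    agent.main_parse()  # parse only, write nothing
    agent.main_parse(title_path='titles.json', section_path='sections.json')  # parse and dump JSON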

-    def parse_outline(self) -> list:
-        """PDF outline parsing.
+    def extract_toc(self) -> list:
+        """PDF outline parsing, driven by page content (TOC heuristics).
         """
         results = []
+
+        for page_number, page in enumerate(extract_pages(self.file_path)):
+
+            is_outline = False
+
+            # Only scan pages 2-21; a TOC is not expected elsewhere
+            if page_number < 1:
+                continue
+
+            if page_number > 20:
+                break
+
+            lines = [element.get_text().strip() for element in page if isinstance(element, LTTextBoxHorizontal)]
+
+            for line in lines:
+                # A TOC-like line starts with a digit or a CJK character,
+                # contains dots, and ends with a page number
+                if line and '.' in line and (line[0].isdigit() or '\u4e00' <= line[0] <= '\u9fff') and line[-1].isdigit():
+                    is_outline = True
+                    # Indent level (fixed at 1; nesting is not inferred)
+                    indent_level = 1
+                    # Entry title: leading numbering plus the CJK text run
+                    title = re.findall(r'^[\d.、]*[\u4e00-\u9fff、()\s]+', line).pop()
+                    # Trailing page number
+                    page_n = int(re.findall(r'\d+$', line).pop())
+                    results.append({
+                        "level": indent_level,
+                        "title": title,
+                        "page_number": page_n
+                    })
+
+            # Stop at the first scanned page with no TOC-like lines
+            if not is_outline:
+                break
+
+        return results
+
+    def extract_content(self, content_path: str = None) -> list:
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                self.content.append({
+                    'page_number': page.page_number - 1,
+                    'text': page.extract_text()
+                })
+
+        if content_path:
+            with open(content_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.content, fp, indent=4, ensure_ascii=False)
+
+        return self.content
+
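extract_content collects one record per page, with a zero-based page index; the returned shape (illustrative) is:

    [{'page_number': 0, 'text': '...'}, {'page_number': 1, 'text': '...'}]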
+    def parse_outline(self, outline_path: str = None) -> list:
+        """PDF outline parsing, driven by document metadata; falls back to
+        content-based parsing (extract_toc) when no outline metadata exists.
+        """
+        results = []
+
         with open(self.file_path, "rb") as fp:
             try:
                 parser = PDFParser(fp)
@@ -464,8 +526,12 @@ class PdfExtractAttr(object):
             finally:
                 parser.close()

-        with open('outlines.json', 'w', encoding='utf-8') as op:
-            json.dump(results, op, indent=4, ensure_ascii=False)
+        if not results:
+            results = self.extract_toc()
+
+        if outline_path:
+            with open(outline_path, 'w', encoding='utf-8') as op:
+                json.dump(results, op, indent=4, ensure_ascii=False)

         self.outlines = pd.DataFrame(results)

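As a quick illustration of the TOC heuristic in extract_toc (the input line is hypothetical; only the standard re module is needed):

    import re

    line = '1.2、投标人资格 .................... 12'
    # Gate: starts with a digit or CJK char, contains '.', ends with a digit
    assert line[0].isdigit() and '.' in line and line[-1].isdigit()
    title = re.findall(r'^[\d.、]*[\u4e00-\u9fff、()\s]+', line).pop()  # '1.2、投标人资格 '
    page_n = int(re.findall(r'\d+$', line).pop())                       # 12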
@@ -540,7 +606,7 @@ class PdfExtractAttr(object):
         else:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})

-    def parse_table_pro(self) -> None:
+    def parse_table_pro(self, table_path: str = 'all_tables.json') -> list:
         """Table parsing.
         """
         if self.detail_df is None:
@@ -551,14 +617,24 @@ class PdfExtractAttr(object):
             # Check whether any tables exist on this page
             tables = page_layout.find_tables()

+            if not tables:
+                continue
+
+            print(f"Parsing tables on PDF page {page_number}")
+
             tables_pro = camelot.read_pdf(
                 self.file_path,
                 # flavor='stream',
                 pages=str(page_number+1),
                 # edge_tol=200,
             )
+
+            if not tables_pro:
+                continue
+
             # Exactly one table detected on this page: decide whether to merge it
-            if len(tables) == 1:
+            if len(tables) == 1 and tables_pro:
                 table = tables[0]
                 table_pro = tables_pro[0].df.to_dict(orient='split')['data']
                 x0, y0, x1, y1 = table.bbox
@@ -576,24 +652,22 @@ class PdfExtractAttr(object):
             for table_index in range(1, len(tables_pro)):
                 self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)

-    def output(self, table_path: str = 'all_tables.json'):
-        """Dump results.
-        """
-        with open(table_path, 'w', encoding='utf-8') as fp:
-            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
+        if table_path:
+            with open(table_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.tables, fp, indent=4, ensure_ascii=False)

         return self.tables

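With output() folded into parse_table_pro, one call now parses and, optionally, dumps; a minimal sketch (file names hypothetical):

    agent = PdfExtractAttr(file_path='./bid.pdf')
    tables = agent.parse_table_pro(table_path='tables.json')  # parse and write JSON
    tables = agent.parse_table_pro(table_path=None)           # parse only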
 if __name__ == '__main__':
-    # pdf_path = './投标文件-修改版9-5-1-1.pdf'
-    pdf_path = './南方电网数字研究院有限公司.pdf'
+    pdf_path = './投标文件-修改版9-5-1-1.pdf'
+    # pdf_path = './南方电网数字研究院有限公司.pdf'
     # pdf_path = './2022年度工程类-公招采购资料/2022-2025年度三峡电站9台机组检修密封加工制作重新招标/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
     # title_path = './投标文件-修改版9-5-1-1.json'
     # title_path = './投标文件-修改版9-5-1-1-title.json'
-    title_path = './南方电网数字研究院有限公司.json'
+    # title_path = './南方电网数字研究院有限公司.json'
     # section_path = './投标文件-修改版9-5-1-1-section.json'
-    section_path = './南方电网数字研究院有限公司-section.json'
+    # section_path = './南方电网数字研究院有限公司-section.json'
     # image_dir = './extracted_images'
     # os.makedirs(image_dir, exist_ok=True)

@@ -601,7 +675,7 @@ if __name__ == '__main__':
     # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)

     agent = PdfExtractAttr(file_path=pdf_path)
-    agent.parse_outline()
-    agent.main_parse(title_path=title_path, section_path=section_path)
-    # agent.parse_table_pro()
-    # agent.output('all_tables_pro.json')
+    # agent.parse_outline()
+    # agent.main_parse(title_path=title_path, section_path=section_path)
+    agent.parse_table_pro(table_path='all_tables_pro.json')
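Taken together, every dump path in the class is now optional. A minimal end-to-end sketch of the refactored API (output file names hypothetical):

    agent = PdfExtractAttr(file_path='./投标文件-修改版9-5-1-1.pdf')
    outline = agent.parse_outline(outline_path='outlines.json')   # metadata first, TOC fallback
    content = agent.extract_content(content_path='content.json')  # per-page plain text
    tables = agent.parse_table_pro(table_path='all_tables_pro.json')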
|