1 rok pred · ace5c2b4fb
--- a/document_.py
+++ b/document_.py
@@ -23,13 +23,15 @@ chinese_num_map = {
 
				 
			
 
				 
			
 
				 
			
 
				-class DocumentPreReview():
			
 
				-    def __init__(self) -> None:
			
 
				+class DocumentPreReview:
			
 
				+    def __init__(self, table_path: str = 'all_tables.json') -> None:
			
 
				+        self.table_path = table_path
			
 
				+
			
 
				         self.bm = BaseMethods()
			
 
				         self.bidding_tables = self.get_bidding_table()
			
 
				-        self.contexts = self.get_contexts()
			
 
				-        self.announcement = self.get_announcement()
			
 
				-        self.bidding_context = self.get_bidding_context()
			
 
				+        # self.contexts = self.get_contexts()
			
 
				+        # self.announcement = self.get_announcement()
			
 
				+        # self.bidding_context = self.get_bidding_context()
			
 
				         self.chinese_num_map = chinese_num_map
			
 
				 
			
 
				     def get_contexts(self, file_path:str = 'data/contexts.json'):
			
@@ -41,9 +43,7 @@ class DocumentPreReview():
 
				     def get_bidding_table(self):
			
 
				         ''' get table data
			
 
				         '''
			
 
				-        file_path = "data/all_tables_三峡左右岸.json"
			
 
				-        # file_path = "code/bidding_document_extract/all_tables_三峡左右岸.json"
			
 
				-        all_tables = self.bm.json_read(file_path)
			
 
				+        all_tables = self.bm.json_read(self.table_path)
			
 
				         return all_tables
			
 
				     
			
 
				     def get_bidding_context(self):
			
@@ -76,12 +76,12 @@ class DocumentPreReview():
 
				             title_len = partial_form['title_len']
			
 
				             tables = partial_form["table"]
			
 
				             
			
 
				-            if '投标人须知前附表' == table_name:  
			
 
				-                record_page = page_number[0]
			
 
				-            if page_number[0] < record_page + 3: 
			
 
				-                for table in tables[1:]:
			
 
				-                    if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
			
 
				-                    if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
			
 
				+            # if '投标人须知前附表' == table_name:  
			
 
				+            #     record_page = page_number[0]
			
 
				+            # if page_number[0] < record_page + 3: 
			
 
				+            #     for table in tables[1:]:
			
 
				+            #         if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
			
 
				+            #         if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
			
 
				                 
			
 
				             if '评标方法' in table_name:
			
 
				                 table_name = table_name.strip().replace("\n","")
			
--- a/get_info.py
+++ b/get_info.py
@@ -2,7 +2,7 @@
 
				 # @Author: privacy
			
 
				 # @Date:   2024-06-11 13:43:14
			
 
				 # @Last Modified by:   privacy
			
 
				-# @Last Modified time: 2024-08-01 13:43:01
			
 
				+# @Last Modified time: 2024-08-01 13:59:10
			
 
				 
			
 
				 # import os
			
 
				 
			
@@ -455,11 +455,12 @@ class PdfExtractAttr(object):
 
				         """尝试将表添加到结果列中，有两种情况，直接添加一个新表；拼接最后一个表
			
 
				         @table
			
 
				         """
			
 
				+        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
			
 
				+
			
 
				         if new:
			
 
				             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
			
 
				             return
			
 
				 
			
 
				-        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
			
 
				         tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
			
 
				         if len(table) > 1:
			
 
				             second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,6 @@ torch==2.3.0
 
				 scikit-learn==1.1.1
			
 
				 transformers==4.41.2
			
 
				 textrank4zh==0.3
			
 
				-jieba==0.42.1
			
 
				+jieba==0.42.1
			
 
				+camelot-py==0.11.0
			
 
				+PyMuPDF==1.24.9
			
--- a/tools.py
+++ b/tools.py
@@ -1,5 +1,9 @@
 
				+import os
			
 
				+import json
			
 
				 from enum import Enum, auto
			
 
				 from typing import Any, Optional
			
 
				+
			
 
				+import pandas as pd
			
 
				 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
			
 
				 from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
			
 
				 from pdfminer.pdfparser import PDFParser, PDFSyntaxError
			
@@ -98,4 +102,46 @@ class RefPageNumberResolver:
 
				         if ref_type is PDFRefType.NAMED_REF:
			
 
				             return self.resolve(self.document.get_dest(ref))
			
 
				 
			
 
				-        return None  # PDFRefType.UNK
			
 
				+        return None  # PDFRefType.UNK
			
 
				+
			
 
				+
			
 
				+class BaseMethods:
			
 
				+    ''' base methods class
			
 
				+    '''
			
 
				+    def __init__(self) -> None:
			
 
				+        pass
			
 
				+
			
 
				+    def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
			
 
				+        ''' 读取xls文件方法
			
 
				+        '''
			
 
				+        return pd.read_excel(file_path, sheet_name=sheetname)
			
 
				+
			
 
				+    def json_read(self, file_path: str):
			
 
				+        ''' 读取json文件方法
			
 
				+        '''
			
 
				+        with open(file_path, "r", encoding='utf-8') as fp:
			
 
				+            return json.load(fp)
			
 
				+        
			
 
				+    def save_file(self, save_data: list, save_path: str, file_format: str):
			
 
				+        ''' 保存文件
			
 
				+        '''
			
 
				+        if file_format == "json":
			
 
				+            with open(save_path,'w',encoding='utf-8') as sf:
			
 
				+                sf.write(json.dumps(save_data,ensure_ascii=False))
			
 
				+        elif file_format == "xlsx" or file_format == "xls":
			
 
				+            with pd.ExcelWriter(save_path) as fp:
			
 
				+                save_data.to_excel(fp, sheet_name="Sheet1")
			
 
				+        elif file_format == 'txt':
			
 
				+            with open(save_path, 'w', encoding='utf-8') as tx:
			
 
				+                for data in save_data:
			
 
				+                    tx.write(data+"\n")
			
 
				+    
			
 
				+    def traverse_file(self, dirpath: str):
			
 
				+        '''
			
 
				+        遍历文件夹下文件
			
 
				+        '''
			
 
				+        filename = tuple()
			
 
				+        for root, dir, files in os.walk(dirpath):
			
 
				+            for name in files:
			
 
				+                filename = filename.__add__((name,))
			
 
				+        return filename