sprivacy 1 rok pred
rodič
commit
ace5c2b4fb
4 zmenené súbory: 67 pridaných a 18 odobraných riadkov
  1. 14 14
      document_.py
  2. 3 2
      get_info.py
  3. 3 1
      requirements.txt
  4. 47 1
      tools.py

+ 14 - 14
document_.py

@@ -23,13 +23,15 @@ chinese_num_map = {
 
 
 
-class DocumentPreReview():
-    def __init__(self) -> None:
+class DocumentPreReview:
+    def __init__(self, table_path: str = 'all_tables.json') -> None:
+        self.table_path = table_path
+
         self.bm = BaseMethods()
         self.bidding_tables = self.get_bidding_table()
-        self.contexts = self.get_contexts()
-        self.announcement = self.get_announcement()
-        self.bidding_context = self.get_bidding_context()
+        # self.contexts = self.get_contexts()
+        # self.announcement = self.get_announcement()
+        # self.bidding_context = self.get_bidding_context()
         self.chinese_num_map = chinese_num_map
 
     def get_contexts(self, file_path:str = 'data/contexts.json'):
@@ -41,9 +43,7 @@ class DocumentPreReview():
     def get_bidding_table(self):
         ''' get table data
         '''
-        file_path = "data/all_tables_三峡左右岸.json"
-        # file_path = "code/bidding_document_extract/all_tables_三峡左右岸.json"
-        all_tables = self.bm.json_read(file_path)
+        all_tables = self.bm.json_read(self.table_path)
         return all_tables
     
     def get_bidding_context(self):
@@ -76,12 +76,12 @@ class DocumentPreReview():
             title_len = partial_form['title_len']
             tables = partial_form["table"]
             
-            if '投标人须知前附表' == table_name:  
-                record_page = page_number[0]
-            if page_number[0] < record_page + 3: 
-                for table in tables[1:]:
-                    if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
-                    if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
+            # if '投标人须知前附表' == table_name:  
+            #     record_page = page_number[0]
+            # if page_number[0] < record_page + 3: 
+            #     for table in tables[1:]:
+            #         if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
+            #         if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
                 
             if '评标方法' in table_name:
                 table_name = table_name.strip().replace("\n","")

+ 3 - 2
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-08-01 13:43:01
+# @Last Modified time: 2024-08-01 13:59:10
 
 # import os
 
@@ -455,11 +455,12 @@ class PdfExtractAttr(object):
         """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
         @table
         """
+        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
+
         if new:
             self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
             return
 
-        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
         tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
         if len(table) > 1:
             second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]

+ 3 - 1
requirements.txt

@@ -7,4 +7,6 @@ torch==2.3.0
 scikit-learn==1.1.1
 transformers==4.41.2
 textrank4zh==0.3
-jieba==0.42.1
+jieba==0.42.1
+camelot-py==0.11.0
+PyMuPDF==1.24.9

+ 47 - 1
tools.py

@@ -1,5 +1,9 @@
+import os
+import json
 from enum import Enum, auto
 from typing import Any, Optional
+
+import pandas as pd
 from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
 from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
 from pdfminer.pdfparser import PDFParser, PDFSyntaxError
@@ -98,4 +102,46 @@ class RefPageNumberResolver:
         if ref_type is PDFRefType.NAMED_REF:
             return self.resolve(self.document.get_dest(ref))
 
-        return None  # PDFRefType.UNK
+        return None  # PDFRefType.UNK
+
+
+class BaseMethods:
+    """Small file helpers: read Excel/JSON, save data, list file names."""
+
+    def __init__(self) -> None:
+        pass
+
+    def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
+        """Read sheet ``sheetname`` of the Excel file via ``pd.read_excel``."""
+        return pd.read_excel(file_path, sheet_name=sheetname)
+
+    def json_read(self, file_path: str):
+        """Parse the UTF-8 JSON file at ``file_path`` and return the result."""
+        with open(file_path, "r", encoding='utf-8') as fp:
+            return json.load(fp)
+
+    def save_file(self, save_data: Any, save_path: str, file_format: str):
+        """Write ``save_data`` to ``save_path`` as json, xlsx/xls or txt.
+
+        Excel output takes a DataFrame, or data ``pd.DataFrame`` can wrap;
+        txt output writes each string of ``save_data`` on its own line.
+        Raises ValueError for any other ``file_format``.
+        """
+        if file_format == "json":
+            with open(save_path, 'w', encoding='utf-8') as sf:
+                sf.write(json.dumps(save_data, ensure_ascii=False))
+        elif file_format in ("xlsx", "xls"):
+            # A plain list has no ``to_excel``; wrap it in a DataFrame first.
+            is_frame = hasattr(save_data, "to_excel")
+            frame = save_data if is_frame else pd.DataFrame(save_data)
+            with pd.ExcelWriter(save_path) as fp:
+                frame.to_excel(fp, sheet_name="Sheet1")
+        elif file_format == 'txt':
+            with open(save_path, 'w', encoding='utf-8') as tx:
+                tx.writelines(data + "\n" for data in save_data)
+        else:
+            raise ValueError(f"unsupported file_format: {file_format!r}")
+
+    def traverse_file(self, dirpath: str):
+        """Return a tuple with the name of every file under ``dirpath``."""
+        return tuple(name for _, _, files in os.walk(dirpath) for name in files)