sprivacy 11 bulan lalu
induk
komit
a2865d6713
2 file diubah dengan 65 tambahan dan 67 penghapusan
  1. 23 13
      extract_price.py
  2. 42 54
      text_extractor.py

+ 23 - 13
extract_price.py

@@ -60,37 +60,47 @@ def match_quality(text: str) -> List[str]:
 
 if __name__ == '__main__':
     from pprint import pprint
+    from get_info import PdfExtractAttr
+
+    pdf_path = r'./data/0预审查初审详审测试数据/三峡左岸及地下电站地坪整治/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
+    file = PdfExtractAttr(file_path=pdf_path)
+    titles = file.parse_title()
+    texts = file.parse_text()
 
-    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
     price_zhs = get_instance(
         title_instances=['投标函', '开标一览表'],
         content_instances=['人民币投标总报价'],
-        pdf_path,
-        match_price_zhs
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_price_zhs
     )
     price_num = get_instance(
         title_instances=['投标函', '开标一览表'],
         content_instances=['人民币投标总报价'],
-        pdf_path,
-        match_price_num
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_price_num
     )
     duration = get_instance(
-        title_instances=['投标函', '开标一览表'],
+        title_instances=['投标函', '开标一览表'], 
         content_instances=['工期日历天'],
-        pdf_path,
-        match_duration
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_duration
     )
     quality = get_instance(
-        title_instances=['投标函', '开标一览表'],
+        title_instances=['投标函', '开标一览表'], 
         content_instances=['工程质量'],
-        pdf_path,
-        match_quality
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_quality
     )
-    valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
+    # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
+
     pprint({
         "price_zhs": price_zhs,
         "price_num": price_num,
         "duration": duration,
         "quality": quality,
-        "valid": valid
+        # "valid": valid
     })

+ 42 - 54
text_extractor.py

@@ -5,46 +5,11 @@ from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
 from matcher import Matcher
 from get_info import PdfExtractAttr, is_title
-from typing import Callable, Union, List, Tuple, Dict
+from typing import Callable, Union, List, Tuple, Dict, Optional
 from tqdm import tqdm
 import pandas as pd
 
 
-
-# def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
-#     """
-#     标题解析
-
-#     Args:
-#         pdf_path: PDF文件路径
-
-#     Returns:
-#         results
-#     """
-#     texts = []
-
-#     for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
-#                                          total=resolve1(PDFDocument(
-#                                              PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
-#                                          ):
-#         title_index = 0
-#         for element in page_layout:
-#             if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
-#                 text = element.get_text().strip()
-#                 if text and (is_title(text) or element.height > 15):
-#                     texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
-#                     title_index += 1
-#     results = []
-
-#     for i, text in enumerate(texts):
-#         results.append({'title': text['text'],
-#                         'index': text['index'],
-#                         'page_number': text['page_number'],
-#                         'seq_num': i
-#                         })
-#     return results
-
-
 def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
     if end is None:
         end = start + 1
@@ -106,10 +71,12 @@ def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
 
 def get_instance(title_instances: List[str],
                  content_instances: List[str],
-                 pdf_path: str,
                  extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
+                 titles_list: Optional[list] = None,
+                 texts_list: Optional[list] = None,
+                 pdf_path: Optional[str] = None,
                  page_bias: int = 1,
-                 similarity: float = None
+                 similarity: float = None,
                 ):
     """
     Args:
@@ -123,9 +90,13 @@ def get_instance(title_instances: List[str],
     Returns:
         results
     """
-    file = PdfExtractAttr(file_path=pdf_path)
-    titles = file.parse_title()
-    texts = file.parse_text()
+    if titles_list:
+        titles = titles_list
+    if texts_list:
+        texts = texts_list
+    # file = PdfExtractAttr(file_path=pdf_path)
+    # titles = file.parse_title()
+    # texts = file.parse_text()
 
     title_sims = similarity_filter(
         similar_match(
@@ -148,23 +119,40 @@ def get_instance(title_instances: List[str],
 
 if __name__ == '__main__':
     pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
+    file = PdfExtractAttr(file_path=pdf_path)
+    titles = file.parse_title()
+    texts = file.parse_text()
+
     price_zhs = get_instance(
         title_instances=['投标函', '开标一览表'],
         content_instances=['人民币投标总报价'],
-        pdf_path=pdf_path,
+        titles_list=titles,
+        texts_list=texts,
         extractor=match_price_zhs
     )
-    # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
-    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-    #                          match_price_num)
-    # duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
-    #                         '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-    #                         match_duration)
-    # quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
-    #                        '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-    #                        match_quality)
-    # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
-    # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
-    # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
+    price_num = get_instance(
+        title_instances=['投标函', '开标一览表'],
+        content_instances=['人民币投标总报价'],
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_price_num
+    )
+    duration = get_instance(
+        title_instances=['投标函', '开标一览表'], 
+        content_instances=['工期日历天'],
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_duration
+    )
+    quality = get_instance(
+        title_instances=['投标函', '开标一览表'], 
+        content_instances=['工程质量'],
+        titles_list=titles,
+        texts_list=texts,
+        extractor=match_quality
+    )
+    valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
+    test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
+    valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
     print(price_zhs)
     pass