|
@@ -5,46 +5,11 @@ from pdfminer.pdfdocument import PDFDocument
|
|
|
from pdfminer.pdfparser import PDFParser
|
|
|
from matcher import Matcher
|
|
|
from get_info import PdfExtractAttr, is_title
|
|
|
-from typing import Callable, Union, List, Tuple, Dict
|
|
|
+from typing import Callable, Union, List, Tuple, Dict, Optional
|
|
|
from tqdm import tqdm
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
-
|
|
|
-# def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
|
|
|
-# """
|
|
|
-# 标题解析
|
|
|
-
|
|
|
-# Args:
|
|
|
-# pdf_path: PDF文件路径
|
|
|
-
|
|
|
-# Returns:
|
|
|
-# results
|
|
|
-# """
|
|
|
-# texts = []
|
|
|
-
|
|
|
-# for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
|
|
|
-# total=resolve1(PDFDocument(
|
|
|
-# PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
|
|
|
-# ):
|
|
|
-# title_index = 0
|
|
|
-# for element in page_layout:
|
|
|
-# if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
|
|
|
-# text = element.get_text().strip()
|
|
|
-# if text and (is_title(text) or element.height > 15):
|
|
|
-# texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
|
|
|
-# title_index += 1
|
|
|
-# results = []
|
|
|
-
|
|
|
-# for i, text in enumerate(texts):
|
|
|
-# results.append({'title': text['text'],
|
|
|
-# 'index': text['index'],
|
|
|
-# 'page_number': text['page_number'],
|
|
|
-# 'seq_num': i
|
|
|
-# })
|
|
|
-# return results
|
|
|
-
|
|
|
-
|
|
|
def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
|
|
|
if end is None:
|
|
|
end = start + 1
|
|
@@ -106,10 +71,12 @@ def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
|
|
|
|
|
|
def get_instance(title_instances: List[str],
|
|
|
content_instances: List[str],
|
|
|
- pdf_path: str,
|
|
|
extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
|
|
|
+ titles_list: Optional[list] = None,
|
|
|
+ texts_list: Optional[list] = None,
|
|
|
+ pdf_path: Optional[str] = None,
|
|
|
page_bias: int = 1,
|
|
|
- similarity: float = None
|
|
|
+ similarity: float = None,
|
|
|
):
|
|
|
"""
|
|
|
Args:
|
|
@@ -123,9 +90,13 @@ def get_instance(title_instances: List[str],
|
|
|
Returns:
|
|
|
results
|
|
|
"""
|
|
|
- file = PdfExtractAttr(file_path=pdf_path)
|
|
|
- titles = file.parse_title()
|
|
|
- texts = file.parse_text()
|
|
|
+ if titles_list:
|
|
|
+ titles = titles_list
|
|
|
+ if texts_list:
|
|
|
+ texts = texts_list
|
|
|
+ # file = PdfExtractAttr(file_path=pdf_path)
|
|
|
+ # titles = file.parse_title()
|
|
|
+ # texts = file.parse_text()
|
|
|
|
|
|
title_sims = similarity_filter(
|
|
|
similar_match(
|
|
@@ -148,23 +119,40 @@ def get_instance(title_instances: List[str],
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
|
|
|
+ file = PdfExtractAttr(file_path=pdf_path)
|
|
|
+ titles = file.parse_title()
|
|
|
+ texts = file.parse_text()
|
|
|
+
|
|
|
price_zhs = get_instance(
|
|
|
title_instances=['投标函', '开标一览表'],
|
|
|
content_instances=['人民币投标总报价'],
|
|
|
- pdf_path=pdf_path,
|
|
|
+ titles_list=titles,
|
|
|
+ texts_list=texts,
|
|
|
extractor=match_price_zhs
|
|
|
)
|
|
|
- # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
|
|
|
- # '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
|
|
|
- # match_price_num)
|
|
|
- # duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
|
|
|
- # '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
|
|
|
- # match_duration)
|
|
|
- # quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
|
|
|
- # '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
|
|
|
- # match_quality)
|
|
|
- # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
|
|
|
- # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
|
|
|
- # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
|
|
|
+ price_num = get_instance(
|
|
|
+ title_instances=['投标函', '开标一览表'],
|
|
|
+ content_instances=['人民币投标总报价'],
|
|
|
+ titles_list=titles,
|
|
|
+ texts_list=texts,
|
|
|
+ extractor=match_price_num
|
|
|
+ )
|
|
|
+ duration = get_instance(
|
|
|
+ title_instances=['投标函', '开标一览表'],
|
|
|
+ content_instances=['工期日历天'],
|
|
|
+ titles_list=titles,
|
|
|
+ texts_list=texts,
|
|
|
+ extractor=match_duration
|
|
|
+ )
|
|
|
+ quality = get_instance(
|
|
|
+ title_instances=['投标函', '开标一览表'],
|
|
|
+ content_instances=['工程质量'],
|
|
|
+ titles_list=titles,
|
|
|
+ texts_list=texts,
|
|
|
+ extractor=match_quality
|
|
|
+ )
|
|
|
+ valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
|
|
|
+ test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
|
|
|
+ valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
|
|
|
print(price_zhs)
|
|
|
pass
|