|
@@ -0,0 +1,136 @@
|
|
|
+from pdfminer.high_level import extract_pages
|
|
|
+from pdfminer.layout import LTTextBoxHorizontal
|
|
|
+from pdfminer.pdfinterp import resolve1
|
|
|
+from pdfminer.pdfdocument import PDFDocument
|
|
|
+from pdfminer.pdfparser import PDFParser
|
|
|
+from matcher import Matcher
|
|
|
+from get_info import PdfExtractAttr, is_title
|
|
|
+from typing import Callable, Union, List, Tuple, Dict
|
|
|
+from re import fullmatch
|
|
|
+from tqdm import tqdm
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+
|
|
|
def absolute_not_title(line: str) -> bool:
    """Return True when *line* is definitely not a heading.

    A line that is purely numeric (digits with optional '.' separators,
    optionally ending in '%') is treated as data, never a title.

    Args:
        line: the stripped text of one candidate line.

    Returns:
        True if the whole line matches the numeric pattern, else False.
    """
    # fullmatch() anchors both ends itself, so the original leading '^'
    # was redundant; returning the match test directly replaces the
    # verbose if/else-return-True/False construct.
    return fullmatch(r'\d(\d*\.?\d*)+\d%?', line) is not None
|
|
|
+
|
|
|
+
|
|
|
def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
    """Scan every page of a PDF and collect lines that look like titles.

    A candidate is a single-line horizontal text box whose text either
    passes ``is_title`` or is rendered taller than 15pt, and which is not
    a purely numeric line (``absolute_not_title``).

    Args:
        pdf_path: path of the PDF file to scan.

    Returns:
        One dict per detected title with keys 'title', 'index' (position
        among titles on its page), 'page_number' and 'seq_num' (global
        running number).
    """
    # Read the page count first inside a `with` block; the original
    # opened the file inline in the tqdm(total=...) expression and never
    # closed it (file-handle leak).
    with open(pdf_path, 'rb') as fp:
        page_count = resolve1(PDFDocument(PDFParser(fp)).catalog['Pages'])['Count']

    texts = []
    for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)), total=page_count):
        title_index = 0
        for element in page_layout:
            # Only single-line horizontal text boxes are title candidates.
            if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                text = element.get_text().strip()
                if text and (is_title(text) or element.height > 15) and not absolute_not_title(text):
                    texts.append({'index': title_index,
                                  'page_number': page_number,
                                  'bbox': element.bbox,
                                  'text': text})
                    title_index += 1
    # Re-shape into the public result records (bbox is intentionally dropped).
    return [{'title': t['text'],
             'index': t['index'],
             'page_number': t['page_number'],
             'seq_num': i}
            for i, t in enumerate(texts)]
|
|
|
+
|
|
|
+
|
|
|
def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
    """Collect content entries whose page number lies in [start, end).

    Args:
        contents: parsed page entries; each must carry 'page_number',
            'index', 'text', 'lines' and 'is_table_name'.
        start: first page number (inclusive).
        end: one past the last page number; defaults to ``start + 1``.

    Returns:
        (results, texts) where ``results`` maps page number -> {index: entry}
        and ``texts`` is the flat list of matching texts in input order.
    """
    if end is None:
        end = start + 1
    results: Dict = {}
    texts: List[str] = []
    pages = set(range(start, end))
    for page in contents:
        if page['page_number'] in pages:
            # BUG FIX: the original used results.get(k, {}).update(...),
            # which mutates a throwaway dict when the key is missing, so
            # `results` was always returned empty. setdefault stores the
            # per-page dict inside `results` before updating it.
            results.setdefault(int(page['page_number']), {})[page['index']] = {
                'page_number': page['page_number'],
                'index': page['index'],
                'text': page['text'],
                'lines': page['lines'],
                'is_table_name': page['is_table_name'],
            }
            texts.append(page['text'])
    return results, texts
|
|
|
+
|
|
|
+
|
|
|
def similarity_filter(data: List[dict], expect_similarity: float = None):
    """Lazily keep rows whose '相似度' (similarity) score exceeds a threshold.

    Args:
        data: rows, each carrying a numeric '相似度' entry.
        expect_similarity: similarity threshold; defaults to 0.5.

    Returns:
        A ``filter`` iterator over the qualifying rows.
    """
    # Resolve the threshold once instead of per element. The original
    # isinstance(..., float) test also silently discarded int thresholds
    # (e.g. 1 fell back to 0.5); float() now honours them.
    threshold = 0.5 if expect_similarity is None else float(expect_similarity)
    return filter(lambda row: row['相似度'] > threshold, data)
|
|
|
+
|
|
|
+
|
|
|
def extract_from_texts(text: List[str], extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                       instances: List[str], similarity: float = None) -> Tuple[List[str], List[int]]:
    """Split *text* into sentences, rank them against *instances*, and run
    *extractor* over each ranked sentence.

    Args:
        text: raw lines to process.
        extractor: callable applied to each best-match sentence; receives
            an extra threshold argument when *similarity* is given.
        instances: reference phrases for similarity ranking.
        similarity: optional threshold forwarded to *extractor*.

    Returns:
        (extractions, similarities): non-empty extractor results and the
        per-sentence similarity scores.
    """
    # Strip each line, drop embedded spaces, and fuse everything together.
    fused = ''.join(line.strip().replace(' ', '') for line in text)
    # Sentence-split on '。', discard empties, then re-split on ','.
    sentences = ','.join(part for part in fused.split('。') if part != '').split(',')
    ranked = similar_match([{'text': s} for s in sentences], instances, 'text')
    best_texts = [row['text'] for row in ranked]
    scores = [row['相似度'] for row in ranked]
    if similarity is None:
        extracted = [extractor(t) for t in best_texts]
    else:
        extracted = [extractor(t, similarity) for t in best_texts]
    # Keep only non-empty extraction results.
    return [item for item in extracted if item != []], scores
|
|
|
+
|
|
|
+
|
|
|
def similar_match(data: List[dict], instances: List[str], key: str) -> List[dict]:
    """Score each row of *data* against *instances* and keep, per matched
    instance, only the highest-scoring row.

    Args:
        data: rows to score; each must carry *key*.
        instances: candidate phrases to match against.
        key: the field of *data* holding the text to score.

    Returns:
        Records (list of dicts) — the best row per matched instance, with
        added '因素' (matched instance) and '相似度' (similarity) columns.
    """
    matcher = Matcher()
    df = pd.DataFrame(data)
    keyword_embeddings = matcher.get_embeddings(instances)
    tqdm.pandas(desc='标题相似度匹配')
    # progress_apply so the tqdm.pandas() registration above actually
    # renders a progress bar (plain .apply ignored it).
    result = df[key].progress_apply(
        lambda x: matcher.TopK1(x, instances, matcher.get_embedding(x), keyword_embeddings))
    # NOTE(review): assumes Matcher.TopK1 yields a (因素, 相似度) pair per
    # row so `result` expands to two columns — confirm against Matcher.
    result.columns = ['因素', '相似度']

    df['因素'] = result['因素']
    df['相似度'] = result['相似度']

    # One winner per matched instance: the row with the maximal score.
    max_sim_idx = df.groupby('因素')['相似度'].idxmax()
    return df.loc[max_sim_idx].to_dict(orient='records')
|
|
|
+
|
|
|
+
|
|
|
def get_instance(title_instances: List[str], content_instances: List[str], pdf: str,
                 extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                 page_bias: int = 1, similarity: float = None):
    """Find sections of *pdf* whose titles resemble *title_instances* and
    run *extractor* over the text of the following page(s).

    Args:
        title_instances: phrases matched against detected PDF titles.
        content_instances: phrases used to rank sentences inside matched pages.
        pdf: path of the PDF file to process.
        extractor: callable applied to candidate sentences; may accept an
            extra similarity-threshold argument.
        page_bias: number of pages after a matched title to scan (default 1).
        similarity: threshold used to filter title matches; content
            extraction itself runs without a threshold.

    Returns:
        Flat list accumulating the per-title extraction output.
    """
    file = PdfExtractAttr(file_path=pdf)
    # Layout-based title detection; outline parsing kept for reference.
    # titles = file.parse_outline()
    titles = parse_title(pdf)
    texts = file.parse_text()

    # Keep only titles whose similarity to title_instances beats the threshold.
    title_sims = similarity_filter(similar_match(titles, title_instances, key='title'), similarity)
    results = []
    for i in title_sims:
        current_page = i['page_number']
        _, text = pagination_texts(texts, current_page, current_page + page_bias)
        # NOTE(review): extract_from_texts returns a (extractions,
        # similarities) tuple, so extend() appends BOTH elements to
        # `results` — confirm this flattening is intended vs. append().
        results.extend(extract_from_texts(text, extractor, content_instances))
    return results
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Usage examples retained for reference only; they depend on local
    # paths and extractor helpers (match_price_zhs, rmb_to_digit, ...)
    # that are not imported here.
    # price_zhs = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                          match_price_zhs)
    # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                          match_price_num)
    # duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
    #                         '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                         match_duration)
    # quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
    #                        '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                        match_quality)
    # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
    # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
    pass
|