from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.pdfinterp import resolve1
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from matcher import Matcher
from get_info import PdfExtractAttr, is_title
from typing import Callable, Union, List, Tuple, Dict
from tqdm import tqdm
import pandas as pd


# Legacy in-module title parser, kept commented out for reference;
# get_instance() below calls PdfExtractAttr.parse_title() instead.
#
# def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
#     """
#     Parse titles.
#     Args:
#         pdf_path: path to the PDF file
#     Returns:
#         results
#     """
#     texts = []
#     for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
#                                          total=resolve1(PDFDocument(
#                                              PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
#                                          ):
#         title_index = 0
#         for element in page_layout:
#             if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
#                 text = element.get_text().strip()
#                 if text and (is_title(text) or element.height > 15):
#                     texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
#                     title_index += 1
#     results = []
#     for i, text in enumerate(texts):
#         results.append({'title': text['text'],
#                         'index': text['index'],
#                         'page_number': text['page_number'],
#                         'seq_num': i
#                         })
#     return results


def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
    """Collect the entries whose page number lies in ``[start, end)``.

    Args:
        contents: Entries to scan. Each entry is read through the keys
            ``page_number``, ``index``, ``text``, ``lines`` and
            ``is_table_name``.
        start: First page number to include.
        end: Page number to stop before. Defaults to ``start + 1``, i.e. a
            single page.

    Returns:
        A ``(results, texts)`` pair. ``results`` maps ``int(page_number)`` to
        a dict keyed by each entry's ``index`` holding a copy of the five
        fields listed above; ``texts`` holds the ``text`` of every selected
        entry in input order.
    """
    if end is None:
        end = start + 1
    wanted_pages = set(range(start, end))

    results = {}
    texts = []
    for entry in contents:
        page_number = entry['page_number']
        if page_number not in wanted_pages:
            continue
        # setdefault() stores the per-page dict inside ``results``. The
        # previous ``results.get(key, {})`` updated a throwaway dict, so
        # ``results`` was always returned empty.
        results.setdefault(int(page_number), {})[entry['index']] = {
            'page_number': page_number,
            'index': entry['index'],
            'text': entry['text'],
            'lines': entry['lines'],
            'is_table_name': entry['is_table_name'],
        }
        texts.append(entry['text'])
    return results, texts


def similarity_filter(data: List[dict], expect_similarity: float = None):
    """Lazily keep the rows whose ``'相似度'`` value exceeds a threshold.

    Args:
        data: Rows to filter; each row is read through the key ``'相似度'``.
        expect_similarity: Exclusive lower bound. ``None`` (or any
            non-numeric value) falls back to ``0.5``.

    Returns:
        A ``filter`` iterator over the rows that pass (single use).
    """
    # Accept any real number; an int threshold such as ``1`` used to be
    # silently replaced by 0.5. ``bool`` is excluded on purpose.
    is_number = (isinstance(expect_similarity, (int, float))
                 and not isinstance(expect_similarity, bool))
    threshold = expect_similarity if is_number else 0.5

    def passes(row: dict) -> bool:
        return row['相似度'] > threshold

    return filter(passes, data)


def extract_from_texts(text: List[str],
                       extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                       instances: List[str],
                       similarity: float = None) -> Tuple[List[List[str]], list]:
    """Split page texts into fragments, match them to ``instances``, and run
    ``extractor`` on the best-matching fragments.

    Args:
        text: Raw text strings (e.g. the second value returned by
            ``pagination_texts``).
        extractor: Callable applied to each matched fragment. It is called
            with the fragment alone when ``similarity`` is ``None``, otherwise
            with ``(fragment, similarity)``.
        instances: Candidate phrases handed to ``similar_match``.
        similarity: Optional second argument forwarded to ``extractor``.

    Returns:
        ``(extracted, similarities)``: the non-empty extractor outputs, and
        the ``'相似度'`` value of every matched fragment (not filtered, so the
        two lists may differ in length).
    """
    # Trim each string, drop ASCII spaces, and concatenate everything.
    merged = ''.join(piece.strip().replace(' ', '') for piece in text)
    # Cut into sentences on the full stop '。', discarding empty sentences,
    # then cut further on ASCII commas.
    sentences = [sentence for sentence in merged.split('。') if sentence != '']
    fragments = ','.join(sentences).split(',')

    matches = similar_match([{'text': fragment} for fragment in fragments], instances, 'text')
    matched_texts = [row['text'] for row in matches]
    similarities = [row['相似度'] for row in matches]

    if similarity is None:
        extracted = [extractor(fragment) for fragment in matched_texts]
    else:
        extracted = [extractor(fragment, similarity) for fragment in matched_texts]
    return [item for item in extracted if item != []], similarities


def similar_match(data: List[dict], instances: List[str], key: str) -> List[dict]:
    """For each instance, return the row of ``data`` that matches it best.

    Args:
        data: Rows to score; the text to match is read from ``row[key]``.
        instances: Candidate phrases passed to the ``Matcher``.
        key: Name of the field in each row that holds the text.

    Returns:
        A list of row dicts (the original fields plus ``'因素'`` and
        ``'相似度'``), one per distinct ``'因素'`` value, each being the row
        with the highest ``'相似度'`` in its group. Empty when ``data`` is
        empty.
    """
    if not data:
        # An empty frame has no ``key`` column; return early instead of
        # failing with KeyError below.
        return []

    matcher = Matcher()
    df = pd.DataFrame(data)
    keyword_embeddings = matcher.get_embeddings(instances)
    # NOTE(review): this only registers tqdm's ``progress_apply`` on pandas;
    # the plain ``apply`` below shows no progress bar. Confirm the intent.
    tqdm.pandas(desc='标题相似度匹配')
    # NOTE(review): assumes Matcher.TopK1 returns a two-field result that
    # pandas expands into two columns - TODO confirm against matcher.py.
    result = df[key].apply(lambda x: matcher.TopK1(x, instances, matcher.get_embedding(x), keyword_embeddings))
    result.columns = ['因素', '相似度']
    df['因素'] = result['因素']
    df['相似度'] = result['相似度']
    # Keep, for every matched factor, the single row with the highest score.
    max_sim_idx = df.groupby('因素')['相似度'].idxmax()
    max_sim_rows = df.loc[max_sim_idx]
    return max_sim_rows.to_dict(orient='records')


def get_instance(title_instances: List[str],
                 content_instances: List[str],
                 pdf_path: str,
                 extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                 page_bias: int = 1,
                 similarity: float = None
                 ):
    """Locate pages by title similarity and extract values from their text.

    Args:
        title_instances: Phrases matched against the titles parsed from the
            PDF.
        content_instances: Phrases matched against the text fragments of the
            selected pages.
        pdf_path: Path of the PDF file, passed to ``PdfExtractAttr``.
        extractor: Callable applied to each matched text fragment (called
            with the fragment only).
        page_bias: Number of pages to read starting at each matched title's
            page.
        similarity: Threshold used to filter the title matches (see
            ``similarity_filter``); it is not forwarded to ``extractor``.

    Returns:
        A flat list that, for every title match passing the filter, contains
        two consecutive items: the list of extractor outputs and the list of
        fragment similarities returned by ``extract_from_texts``.
    """
    file = PdfExtractAttr(file_path=pdf_path)
    titles = file.parse_title()
    texts = file.parse_text()

    title_matches = similar_match(titles, title_instances, key='title')
    title_sims = similarity_filter(title_matches, similarity)

    results = []
    for title in title_sims:
        current_page = title['page_number']
        _, page_texts = pagination_texts(texts, current_page, current_page + page_bias)
        # extend() with the returned 2-tuple appends both the extracted
        # values and the similarity list, in that order.
        results.extend(extract_from_texts(page_texts, extractor, content_instances))
    return results


if __name__ == '__main__':
    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
    # NOTE(review): match_price_zhs (and the match_* / rmb_to_digit helpers in
    # the commented-out lines below) are not imported in this file as shown;
    # running this script raises NameError until they are imported.
    price_zhs = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['人民币投标总报价'],
        pdf_path=pdf_path,
        extractor=match_price_zhs
    )
    # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                          match_price_num)
    # duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
    #                         '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                         match_duration)
    # quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
    #                        '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                        match_quality)
    # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
    # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
    print(price_zhs)