123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- from pdfminer.high_level import extract_pages
- from pdfminer.layout import LTTextBoxHorizontal
- from pdfminer.pdfinterp import resolve1
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfparser import PDFParser
- from matcher import Matcher
- from get_info import PdfExtractAttr, is_title
- from typing import Callable, Union, List, Tuple, Dict, Optional
- from tqdm import tqdm
- import pandas as pd
def pagination_texts(contents: List[dict], start: int, end: Optional[int] = None) -> Tuple[Dict, List[str]]:
    """Collect the text blocks located on pages ``start`` .. ``end - 1``.

    Args:
        contents: Parsed text blocks. Each dict must provide the keys
            'page_number', 'index', 'text', 'lines' and 'is_table_name'.
        start: First page number to include.
        end: Page number to stop before (exclusive). Defaults to
            ``start + 1``, i.e. only page ``start`` is selected.

    Returns:
        A tuple ``(results, texts)`` where ``results`` maps
        ``int(page_number)`` to ``{index: block}`` for every selected block
        and ``texts`` lists the selected blocks' 'text' values in input order.
    """
    if end is None:
        end = start + 1
    wanted_pages = set(range(start, end))

    results: Dict[int, Dict] = {}
    texts: List[str] = []
    for block in contents:
        if block['page_number'] not in wanted_pages:
            continue
        # setdefault stores the per-page dict inside ``results``; a plain
        # ``results.get(page, {})`` would update a throwaway dict and leave
        # ``results`` empty.
        page_blocks = results.setdefault(int(block['page_number']), {})
        page_blocks[block['index']] = {
            'page_number': block['page_number'],
            'index': block['index'],
            'text': block['text'],
            'lines': block['lines'],
            'is_table_name': block['is_table_name'],
        }
        texts.append(block['text'])
    return results, texts
def similarity_filter(data: List[dict], expect_similarity: Optional[float] = None):
    """Lazily keep the rows whose '相似度' score is strictly above a threshold.

    Args:
        data: Rows that each carry a numeric '相似度' entry.
        expect_similarity: Minimum score (exclusive). Any real number is
            honoured, including integers such as ``0`` or ``1``. ``None``
            (or a non-numeric value) falls back to the default of 0.5.

    Returns:
        A ``filter`` iterator over the rows of ``data`` that pass the
        threshold; it can be consumed only once.
    """
    # bool is a subclass of int; it is excluded so True/False keep falling
    # back to the default threshold as before.
    is_number = (isinstance(expect_similarity, (int, float))
                 and not isinstance(expect_similarity, bool))
    threshold = expect_similarity if is_number else 0.5

    def above_threshold(row: dict) -> bool:
        return row['相似度'] > threshold

    return filter(above_threshold, data)
def extract_from_texts(text: List[str], extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                       instances: List[str], similarity: float = None) -> Tuple[List[List[str]], List[float]]:
    """Run ``extractor`` on the text fragments that best match ``instances``.

    The input strings are stripped, cleared of ASCII spaces and concatenated.
    The result is split on '。' (empty sentences dropped) and then on ','.
    The fragments are passed to ``similar_match`` and ``extractor`` is
    applied to the 'text' of every row it returns.

    Args:
        text: Raw text pieces, e.g. the text blocks of one or more pages.
        extractor: Callable applied to each matched fragment. It is called as
            ``extractor(fragment)`` when ``similarity`` is None, otherwise as
            ``extractor(fragment, similarity)``.
        instances: Reference phrases handed to ``similar_match``.
        similarity: Optional second argument forwarded to ``extractor``.

    Returns:
        A tuple ``(extracted, similarities)``. ``extracted`` holds the
        extractor results that are not ``[]``. ``similarities`` holds the
        '相似度' value of every matched row. Because empty results are
        removed from ``extracted`` only, the two lists are not guaranteed to
        line up index by index.
    """
    merged = ''.join(piece.strip().replace(' ', '') for piece in text)
    sentences = [sentence for sentence in merged.split('。') if sentence != '']
    # Re-joining with ',' and splitting again breaks every sentence into its
    # comma-separated fragments in a single pass.
    fragments = ','.join(sentences).split(',')

    matches = similar_match([{'text': fragment} for fragment in fragments], instances, 'text')
    matched_texts = [match['text'] for match in matches]
    similarities = [match['相似度'] for match in matches]

    if similarity is None:
        extracted = [extractor(fragment) for fragment in matched_texts]
    else:
        extracted = [extractor(fragment, similarity) for fragment in matched_texts]
    return [item for item in extracted if item != []], similarities
def similar_match(data: List[dict], instances: List[str], key: str) -> List[dict]:
    """Pick, for every matched instance, the row of ``data`` that scores highest.

    Each row's ``key`` value is scored against ``instances`` with
    ``Matcher.TopK1``. The matched instance is stored under '因素' and its
    score under '相似度'. Rows are then grouped by '因素' and only the row
    with the highest '相似度' in each group is kept.

    Args:
        data: Rows to match; every dict must contain ``key``.
        instances: Reference phrases to match against.
        key: Name of the field in each row that holds the text to compare.

    Returns:
        The best row per matched instance as a list of dicts, each extended
        with '因素' and '相似度'. An empty ``data`` yields an empty list.
    """
    if not data:
        # An empty DataFrame has no ``key`` column; return before building
        # the matcher so the lookup below cannot raise KeyError.
        return []

    matcher = Matcher()
    df = pd.DataFrame(data)
    keyword_embeddings = matcher.get_embeddings(instances)

    # tqdm.pandas() only registers progress_apply; the bar is shown only when
    # that method is used instead of plain apply.
    tqdm.pandas(desc='标题相似度匹配')
    result = df[key].progress_apply(
        lambda x: matcher.TopK1(x, instances, matcher.get_embedding(x), keyword_embeddings))
    # NOTE(review): assumes TopK1 returns a two-element pandas Series so the
    # apply expands into a two-column frame - confirm against Matcher.
    result.columns = ['因素', '相似度']
    df['因素'] = result['因素']
    df['相似度'] = result['相似度']

    best_row_labels = df.groupby('因素')['相似度'].idxmax()
    return df.loc[best_row_labels].to_dict(orient='records')
def get_instance(title_instances: List[str],
                 content_instances: List[str],
                 extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                 titles_list: Optional[list] = None,
                 texts_list: Optional[list] = None,
                 pdf_path: Optional[str] = None,
                 page_bias: int = 1,
                 similarity: Optional[float] = None,
                 ):
    """Locate sections by title and extract values from the pages that follow.

    Titles are matched against ``title_instances``. For every title that
    passes the similarity filter, the text of pages ``page_number`` up to
    ``page_number + page_bias - 1`` is collected and handed to
    ``extract_from_texts`` together with ``content_instances``.

    Args:
        title_instances: Reference phrases used to find the relevant titles.
        content_instances: Reference phrases used to find the relevant text
            fragments on the selected pages.
        extractor: Callable applied to each matched fragment; it is called
            with the fragment only.
        titles_list: Pre-parsed titles; each entry must provide 'title' and
            'page_number'. Parsed from ``pdf_path`` when None.
        texts_list: Pre-parsed text blocks in the format expected by
            ``pagination_texts``. Parsed from ``pdf_path`` when None.
        pdf_path: PDF to parse when ``titles_list`` or ``texts_list`` is
            not supplied.
        page_bias: Number of pages, starting at the title's page, to search.
        similarity: Threshold for the title filter (see
            ``similarity_filter``). It is not forwarded to ``extractor``.

    Returns:
        A flat list that holds, for every accepted title in turn, the list
        of extractor results followed by the list of similarity scores
        returned by ``extract_from_texts``. Empty when there are no titles.

    Raises:
        ValueError: If titles or texts are missing and ``pdf_path`` is None.
    """
    titles = titles_list
    texts = texts_list
    if titles is None or texts is None:
        if pdf_path is None:
            raise ValueError(
                'get_instance needs titles_list and texts_list, or a pdf_path to parse them from')
        parsed = PdfExtractAttr(file_path=pdf_path)
        if titles is None:
            titles = parsed.parse_title()
        if texts is None:
            texts = parsed.parse_text()

    if not titles:
        # Nothing to match against; skip the embedding work entirely.
        return []

    title_sims = similarity_filter(
        similar_match(titles, title_instances, key='title'),
        similarity,
    )

    results = []
    for matched_title in title_sims:
        current_page = matched_title['page_number']
        _, page_texts = pagination_texts(texts, current_page, current_page + page_bias)
        # extend() with the returned tuple appends two items per title: the
        # extracted values and their similarity scores. Callers index into
        # this flat layout, so it is kept as is.
        results.extend(extract_from_texts(page_texts, extractor, content_instances))
    return results
if __name__ == '__main__':
    # Ad-hoc smoke run: parse one bid document once and extract the price
    # (in Chinese numerals and in digits), the construction period and the
    # quality statement from the sections whose titles match the bid letter
    # or the bid-opening summary.
    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
    file = PdfExtractAttr(file_path=pdf_path)
    titles = file.parse_title()
    texts = file.parse_text()

    # NOTE(review): match_price_zhs, match_price_num, match_duration,
    # match_quality and rmb_to_digit are not imported in the visible part of
    # this file - confirm where they come from before running this script.
    price_zhs = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['人民币投标总报价'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_price_zhs
    )
    price_num = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['人民币投标总报价'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_price_num
    )
    duration = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['工期日历天'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_duration
    )
    quality = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['工程质量'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_quality
    )

    # get_instance returns [extracted, similarities, ...]; [0][0][0] is the
    # first string of the first extractor hit. Guard the lookup so a document
    # without any match does not end the run with an IndexError.
    if price_zhs and price_zhs[0] and price_num and price_num[0]:
        # [1:] drops the first character of the numeric price
        # (presumably a currency symbol - TODO confirm).
        valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    else:
        valid = False

    # Independent sanity check of the numeral converter on a known value;
    # kept in its own name so it no longer overwrites ``valid``.
    test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
    converter_ok = test == '14208967.36'
    print(price_zhs)
|