text_extractor.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. from pdfminer.high_level import extract_pages
  2. from pdfminer.layout import LTTextBoxHorizontal
  3. from pdfminer.pdfinterp import resolve1
  4. from pdfminer.pdfdocument import PDFDocument
  5. from pdfminer.pdfparser import PDFParser
  6. from matcher import Matcher
  7. from get_info import PdfExtractAttr, is_title
  8. from typing import Callable, Union, List, Tuple, Dict
  9. from tqdm import tqdm
  10. import pandas as pd
  11. # def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
  12. # """
  13. # 标题解析
  14. # Args:
  15. # pdf_path: PDF文件路径
  16. # Returns:
  17. # results
  18. # """
  19. # texts = []
  20. # for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
  21. # total=resolve1(PDFDocument(
  22. # PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
  23. # ):
  24. # title_index = 0
  25. # for element in page_layout:
  26. # if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
  27. # text = element.get_text().strip()
  28. # if text and (is_title(text) or element.height > 15):
  29. # texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
  30. # title_index += 1
  31. # results = []
  32. # for i, text in enumerate(texts):
  33. # results.append({'title': text['text'],
  34. # 'index': text['index'],
  35. # 'page_number': text['page_number'],
  36. # 'seq_num': i
  37. # })
  38. # return results
  39. def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
  40. if end is None:
  41. end = start + 1
  42. results = {}
  43. texts = []
  44. pages = set(range(start, end))
  45. for page in contents:
  46. if page['page_number'] in pages:
  47. results.get(int(page['page_number']), {}).update(
  48. {
  49. page['index']: {
  50. 'page_number': page['page_number'],
  51. 'index': page['index'],
  52. 'text': page['text'],
  53. 'lines': page['lines'],
  54. 'is_table_name': page['is_table_name']
  55. }
  56. })
  57. texts.append(page['text'])
  58. return results, texts
  59. def similarity_filter(data: List[dict], expect_similarity: float = None):
  60. def f(x: dict):
  61. return x['相似度'] > (expect_similarity if isinstance(expect_similarity, float) else 0.5)
  62. return filter(f, data)
  63. def extract_from_texts(text: List[str], extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
  64. instances: List[str], similarity: float = None) -> Tuple[List[str], List[int]]:
  65. texts = ','.join(filter(lambda x: x != '',
  66. ''.join([''.join(filter(lambda x: x != ' ', list(i.strip()))) for i in text]).split(
  67. '。'))).split(',')
  68. sims = similar_match([{'text': i} for i in texts], instances, 'text')
  69. s_texts = [i['text'] for i in sims]
  70. similarities = [i['相似度'] for i in sims]
  71. if similarity is None:
  72. return list(filter(lambda x: x != [], [extractor(i) for i in s_texts])), similarities
  73. else:
  74. return list(filter(lambda x: x != [], [extractor(i, similarity) for i in s_texts])), similarities
  75. def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
  76. matcher = Matcher()
  77. df = pd.DataFrame(data)
  78. keyword_embeddings = matcher.get_embeddings(instances)
  79. tqdm.pandas(desc='标题相似度匹配')
  80. result = df[key].apply(lambda x: matcher.TopK1(x, instances, matcher.get_embedding(x), keyword_embeddings))
  81. result.columns = ['因素', '相似度']
  82. df['因素'] = result['因素']
  83. df['相似度'] = result['相似度']
  84. max_sim_idx = df.groupby('因素')['相似度'].idxmax()
  85. max_sim_rows = df.loc[max_sim_idx]
  86. return max_sim_rows.to_dict(orient='records')
def get_instance(title_instances: List[str],
                 content_instances: List[str],
                 pdf_path: str,
                 extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                 page_bias: int = 1,
                 similarity: float = None
                 ):
    """
    Locate sections of a PDF by title similarity and extract values from them.

    Titles parsed from the PDF are matched against ``title_instances``; for
    each title that passes the similarity filter, the text of its page plus
    the next ``page_bias`` pages is run through ``extractor``.

    Args:
        title_instances: candidate title strings used to locate sections.
        content_instances: candidate content strings used to rank sentences.
        pdf_path: path to the PDF file to parse.
        extractor: callable applied to each candidate sentence; may accept
            an optional similarity threshold as a second argument.
        page_bias: number of pages after the matched title page to include.
        similarity: threshold forwarded to similarity_filter (None -> 0.5).
    Returns:
        results: accumulated outputs of extract_from_texts.
    """
    file = PdfExtractAttr(file_path=pdf_path)
    titles = file.parse_title()
    texts = file.parse_text()
    title_sims = similarity_filter(
        similar_match(
            titles,
            title_instances,
            key='title'
        ),
        similarity
    )
    results = []
    for i in title_sims:
        current_page = i['page_number']
        _, text = pagination_texts(texts, current_page, current_page + page_bias)
        # NOTE(review): extract_from_texts returns a (texts, similarities)
        # tuple, so extend() appends BOTH elements to results — confirm this
        # is intended rather than extending with the extracted texts only.
        results.extend(extract_from_texts(text, extractor, content_instances))
    return results
if __name__ == '__main__':
    # Smoke test: extract the bid price (in Chinese numerals) from a sample
    # tender document.
    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
    # NOTE(review): match_price_zhs is neither defined nor imported in this
    # file — running this block raises NameError; confirm the intended import.
    price_zhs = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['人民币投标总报价'],
        pdf_path=pdf_path,
        extractor=match_price_zhs
    )
    # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                          match_price_num)
    # duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
    #                         '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                         match_duration)
    # quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
    #                        '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
    #                        match_quality)
    # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
    # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
    print(price_zhs)
    pass