# text_extractor.py — keyword-driven field extraction from bid-document PDFs.
  1. from pdfminer.high_level import extract_pages
  2. from pdfminer.layout import LTTextBoxHorizontal
  3. from pdfminer.pdfinterp import resolve1
  4. from pdfminer.pdfdocument import PDFDocument
  5. from pdfminer.pdfparser import PDFParser
  6. from matcher import Matcher
  7. from get_info import PdfExtractAttr, is_title
  8. from typing import Callable, Union, List, Tuple, Dict, Optional
  9. from tqdm import tqdm
  10. import pandas as pd
  11. def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
  12. if end is None:
  13. end = start + 1
  14. results = {}
  15. texts = []
  16. pages = set(range(start, end))
  17. for page in contents:
  18. if page['page_number'] in pages:
  19. results.get(int(page['page_number']), {}).update(
  20. {
  21. page['index']: {
  22. 'page_number': page['page_number'],
  23. 'index': page['index'],
  24. 'text': page['text'],
  25. 'lines': page['lines'],
  26. 'is_table_name': page['is_table_name']
  27. }
  28. })
  29. texts.append(page['text'])
  30. return results, texts
  31. def similarity_filter(data: List[dict], expect_similarity: float = None):
  32. def f(x: dict):
  33. return x['相似度'] > (expect_similarity if isinstance(expect_similarity, float) else 0.5)
  34. return filter(f, data)
  35. def extract_from_texts(text: List[str], extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
  36. instances: List[str], similarity: float = None) -> Tuple[List[str], List[int]]:
  37. texts = ','.join(filter(lambda x: x != '',
  38. ''.join([''.join(filter(lambda x: x != ' ', list(i.strip()))) for i in text]).split(
  39. '。'))).split(',')
  40. sims = similar_match([{'text': i} for i in texts], instances, 'text')
  41. s_texts = [i['text'] for i in sims]
  42. similarities = [i['相似度'] for i in sims]
  43. if similarity is None:
  44. return list(filter(lambda x: x != [], [extractor(i) for i in s_texts])), similarities
  45. else:
  46. return list(filter(lambda x: x != [], [extractor(i, similarity) for i in s_texts])), similarities
  47. def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
  48. matcher = Matcher()
  49. df = pd.DataFrame(data)
  50. keyword_embeddings = matcher.get_embeddings(instances)
  51. tqdm.pandas(desc='标题相似度匹配')
  52. result = df[key].apply(lambda x: matcher.TopK1(x, instances, matcher.get_embedding(x), keyword_embeddings))
  53. result.columns = ['因素', '相似度']
  54. df['因素'] = result['因素']
  55. df['相似度'] = result['相似度']
  56. max_sim_idx = df.groupby('因素')['相似度'].idxmax()
  57. max_sim_rows = df.loc[max_sim_idx]
  58. return max_sim_rows.to_dict(orient='records')
  59. def get_instance(title_instances: List[str],
  60. content_instances: List[str],
  61. extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
  62. titles_list: Optional[list] = None,
  63. texts_list: Optional[list] = None,
  64. pdf_path: Optional[str] = None,
  65. page_bias: int = 1,
  66. similarity: float = None,
  67. ):
  68. """
  69. Args:
  70. title_instances
  71. content_instances
  72. file_path
  73. extractor
  74. page_bias
  75. similarity
  76. Returns:
  77. results
  78. """
  79. if titles_list:
  80. titles = titles_list
  81. if texts_list:
  82. texts = texts_list
  83. # file = PdfExtractAttr(file_path=pdf_path)
  84. # titles = file.parse_title()
  85. # texts = file.parse_text()
  86. title_sims = similarity_filter(
  87. similar_match(
  88. titles,
  89. title_instances,
  90. key='title'
  91. ),
  92. similarity
  93. )
  94. results = []
  95. for i in title_sims:
  96. current_page = i['page_number']
  97. _, text = pagination_texts(texts, current_page, current_page + page_bias)
  98. results.extend(extract_from_texts(text, extractor, content_instances))
  99. return results
if __name__ == '__main__':
    # Manual smoke test against one sample bid document.
    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
    file = PdfExtractAttr(file_path=pdf_path)
    titles = file.parse_title()
    texts = file.parse_text()
    # NOTE(review): match_price_zhs, match_price_num, match_duration,
    # match_quality and rmb_to_digit are never defined or imported in this
    # file — running this block raises NameError. Confirm which module
    # provides them and add the import.
    # Total bid price written in Chinese numerals.
    price_zhs = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['人民币投标总报价'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_price_zhs
    )
    # Total bid price written in digits.
    price_num = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['人民币投标总报价'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_price_num
    )
    # Construction duration in calendar days.
    duration = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['工期日历天'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_duration
    )
    # Promised construction quality grade.
    quality = get_instance(
        title_instances=['投标函', '开标一览表'],
        content_instances=['工程质量'],
        titles_list=titles,
        texts_list=texts,
        extractor=match_quality
    )
    # Cross-check: the Chinese-numeral price converted to digits should
    # equal the digit price (first char of the digit match is presumably a
    # currency symbol — TODO confirm).
    valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
    test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
    valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
    print(price_zhs)
    pass