text_extractor.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-08-30 11:17:21
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-03 10:23:35
  6. from typing import Callable, Union, List, Tuple, Dict, Optional
  7. import pandas as pd
  8. from tqdm import tqdm
  9. from celery_tasks.matcher import Matcher
  10. def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
  11. """
  12. """
  13. if end is None:
  14. end = start + 1
  15. results = {}
  16. texts = []
  17. pages = set(range(start, end))
  18. for page in contents:
  19. if page['page_number'] in pages:
  20. results.get(int(page['page_number']), {}).update(
  21. {
  22. page['index']: {
  23. 'page_number': page['page_number'],
  24. 'index': page['index'],
  25. 'text': page['text'],
  26. 'lines': page['lines'],
  27. 'is_table_name': page['is_table_name']
  28. }
  29. })
  30. texts.append(page['text'])
  31. return results, texts
  32. def similarity_filter(data: List[dict], expect_similarity: float = None):
  33. """
  34. """
  35. def f(x: dict):
  36. return x['相似度'] > (expect_similarity if isinstance(expect_similarity, float) else 0.5)
  37. return filter(f, data)
  38. def extract_from_texts(text: List[str], extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
  39. instances: List[str], similarity: float = None) -> Tuple[List[str], List[int]]:
  40. texts = ','.join(filter(lambda x: x != '',
  41. ''.join([''.join(filter(lambda x: x != ' ', list(i.strip()))) for i in text]).split(
  42. '。'))).split(',')
  43. sims = similar_match([{'text': i} for i in texts], instances, 'text')
  44. s_texts = [i['text'] for i in sims]
  45. similarities = [i['相似度'] for i in sims]
  46. if similarity is None:
  47. return list(filter(lambda x: x != [], [extractor(i) for i in s_texts])), similarities
  48. else:
  49. return list(filter(lambda x: x != [], [extractor(i, similarity) for i in s_texts])), similarities
  50. def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
  51. """
  52. """
  53. matcher = Matcher()
  54. df = pd.DataFrame(data)
  55. keyword_embeddings = matcher.get_embeddings(instances)
  56. tqdm.pandas(desc='标题相似度匹配')
  57. result = df[key].apply(lambda x: matcher.TopK1(x, instances, matcher.get_embedding(x), keyword_embeddings))
  58. result.columns = ['因素', '相似度']
  59. df['因素'] = result['因素']
  60. df['相似度'] = result['相似度']
  61. max_sim_idx = df.groupby('因素')['相似度'].idxmax()
  62. max_sim_rows = df.loc[max_sim_idx]
  63. return max_sim_rows.to_dict(orient='records')
def get_instance(title_instances: List[str],
                 content_instances: List[str],
                 extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
                 titles_list: Optional[list] = None,
                 texts_list: Optional[list] = None,
                 pdf_path: Optional[str] = None,
                 page_bias: int = 1,
                 similarity: float = None):
    """Find pages whose titles match ``title_instances``, then extract content
    from those pages with ``extractor`` against ``content_instances``.

    Args:
        title_instances: keyword strings matched against page titles.
        content_instances: keyword strings matched against page text clauses.
        extractor: callable applied to each matched clause; takes the clause
            (and optionally a similarity threshold) and returns a list.
        titles_list: title records, each with a 'title' and 'page_number' key.
        texts_list: text items consumed by ``pagination_texts``.
        pdf_path: accepted but never read inside this function.
        page_bias: number of pages past the title page to include.
        similarity: threshold forwarded to ``similarity_filter`` for titles.

    Returns:
        results: accumulated output of ``extract_from_texts`` per matched title.
    """
    # Keep only titles whose similarity to title_instances clears the threshold.
    title_sims = similarity_filter(
        similar_match(
            titles_list,
            title_instances,
            key='title'
        ),
        similarity
    )
    results = []
    for i in title_sims:
        current_page = i['page_number']
        # Pull the text of the matched page plus the next page_bias pages.
        _, text = pagination_texts(texts_list, current_page, current_page + page_bias)
        # NOTE(review): extract_from_texts returns a (extractions, similarities)
        # tuple, so extend() appends BOTH lists as two separate elements of
        # results — confirm this is intended rather than results.extend of the
        # extractions alone.
        results.extend(extract_from_texts(text, extractor, content_instances))
    return results