# instance_locate.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-26 09:48:46
  6. from typing import List, Optional
  7. from celery_tasks.tools import filter_tables, filter_images, filter_content
  8. from celery_tasks.text_extractor import similarity_filter, similar_match
  9. def get_instances_by_title(title_list: List[dict], table_list: List[dict], image_list: List[dict], content_list: Optional[List[dict]] = None, instances: List[str] = ['近年财务状况表']):
  10. """
  11. 获取给定标题的所有表格和图表
  12. 大标题 outlines
  13. 小标题 text
  14. 表/图
  15. 1. 文字 + 表格(取第一行为标题)
  16. 2. 文字 + 图片(取第一行为标题)
  17. 3. 纯图片、表格(向上合并)
  18. Args:
  19. title_list: PDF 标题列表
  20. table_list: PDF 表格列表
  21. image_list: PDF 图片列表
  22. instances: 给定标题
  23. Returns:
  24. 返回列表,包含标题,索引,起始页,终止页,相似度,表格列表,图片列表
  25. """
  26. title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)
  27. title_filter = [i for i in title_sims]
  28. results = []
  29. for item in title_filter:
  30. try:
  31. item['end_page'] = title_list[item['seq_num'] + 1]['page_number'] - 1
  32. if item['end_page'] <= item['page_number']:
  33. continue
  34. except IndexError:
  35. item['end_page'] = float('inf')
  36. item['tables'] = filter_tables(table_list, item['page_number'], item['end_page'])
  37. if image_list:
  38. item['images'] = filter_images(image_list, item['page_number'], item['end_page'])
  39. if content_list:
  40. item['content'] = filter_content(content_list, item['page_number'], item['end_page'])
  41. results.append(item)
  42. return results
  43. if __name__ == '__main__':
  44. import json
  45. from pprint import pprint
  46. with open('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-title_n.json', 'r', encoding='utf-8') as fp:
  47. title_list = json.load(fp)
  48. with open('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json', 'r', encoding='utf-8') as fp:
  49. table_list = json.load(fp)
  50. with open('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-image.json', 'r', encoding='utf-8') as fp:
  51. image_list = json.load(fp)
  52. year = 2022
  53. instances = get_instances_by_title(
  54. title_list=title_list,
  55. table_list=table_list,
  56. image_list=image_list,
  57. instances=['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)]
  58. )
  59. pprint(instances)