# instance_locate.py (2.8 KB)
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-09-03 10:17:47
from typing import List, Optional

from celery_tasks.tools import filter_tables, filter_images
from celery_tasks.text_extractor import similarity_filter, similar_match
  9. def get_instances_by_title(title_list: List[dict], table_list: List[dict], image_list: List[dict], instances: List[str] = ['近年财务状况表']):
  10. """
  11. 获取给定标题的所有表格和图表
  12. 大标题 outlines
  13. 小标题 text
  14. 表/图
  15. 1. 文字 + 表格(取第一行为标题)
  16. 2. 文字 + 图片(取第一行为标题)
  17. 3. 纯图片、表格(向上合并)
  18. Args:
  19. title_list: PDF 标题列表
  20. table_list: PDF 表格列表
  21. image_list: PDF 图片列表
  22. instances: 给定标题
  23. Returns:
  24. 返回列表,包含标题,索引,起始页,终止页,相似度,表格列表,图片列表
  25. """
  26. title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)
  27. title_filter = [i for i in title_sims]
  28. results = []
  29. for item in title_filter:
  30. try:
  31. item['end_page'] = title_list[item['seq_num'] + 1]['page_number'] - 1
  32. if item['end_page'] <= item['page_number']:
  33. continue
  34. except IndexError:
  35. item['end_page'] = float('inf')
  36. item['tables'] = filter_tables(table_list, item['page_number'], item['end_page'])
  37. item['images'] = filter_images(image_list, item['page_number'], item['end_page'])
  38. results.append(item)
  39. return results
  40. if __name__ == '__main__':
  41. import json
  42. from pprint import pprint
  43. with open('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-title_n.json', 'r', encoding='utf-8') as fp:
  44. title_list = json.load(fp)
  45. with open('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json', 'r', encoding='utf-8') as fp:
  46. table_list = json.load(fp)
  47. with open('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-image.json', 'r', encoding='utf-8') as fp:
  48. image_list = json.load(fp)
  49. year = 2022
  50. instances = get_instances_by_title(
  51. title_list=title_list,
  52. table_list=table_list,
  53. image_list=image_list,
  54. instances=['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)]
  55. )
  56. pprint(instances)