123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-06-11 13:43:14
- # @Last Modified by: privacy
- # @Last Modified time: 2024-09-26 09:48:46
- from typing import List, Optional
- from celery_tasks.tools import filter_tables, filter_images, filter_content
- from celery_tasks.text_extractor import similarity_filter, similar_match
def get_instances_by_title(
    title_list: List[dict],
    table_list: List[dict],
    image_list: List[dict],
    content_list: Optional[List[dict]] = None,
    instances: Optional[List[str]] = None,
):
    """
    Collect the tables, images and content located under the titles that
    match the requested names.

    Author's notes on the document layout (translated):
        big heading   -> outlines
        small heading -> text
        table / figure:
            1. text + table (first row is taken as the title)
            2. text + image (first row is taken as the title)
            3. bare image / table (merged upward)

    Args:
        title_list: PDF title list. Entries are matched on their 'title'
            key and must carry 'page_number'; a matched entry's 'seq_num'
            is used as an index into this list to find the following title.
        table_list: PDF table list, forwarded to ``filter_tables``.
        image_list: PDF image list, forwarded to ``filter_images``. When it
            is empty/None the 'images' key is not added to the results.
        content_list: Optional PDF content list, forwarded to
            ``filter_content``. When it is empty/None the 'content' key is
            not added to the results.
        instances: Titles to look for. Defaults to ['近年财务状况表'].

    Returns:
        A list of the matched title dicts, each extended with 'end_page',
        'tables' and, when the corresponding input was supplied, 'images'
        and 'content'. Per the original description each entry carries the
        title, index, start page, end page, similarity, table list and
        image list.
    """
    if instances is None:
        # Build the default per call rather than sharing one mutable list
        # object between every invocation of the function.
        instances = ['近年财务状况表']

    # 0.5 is the cut-off handed to similarity_filter (presumably the minimum
    # similarity score -- confirm against celery_tasks.text_extractor).
    matched_titles = similarity_filter(
        similar_match(title_list, instances, key='title'), 0.5
    )

    results = []
    for item in matched_titles:
        # NOTE(review): `item` is annotated in place; whether it is the same
        # dict object as the entry in `title_list` depends on similar_match.
        try:
            following_title = title_list[item['seq_num'] + 1]
        except IndexError:
            # Last title of the document: the section runs to the end.
            item['end_page'] = float('inf')
        else:
            # The section ends on the page before the next title starts.
            item['end_page'] = following_title['page_number'] - 1
            # NOTE(review): this also skips sections that span exactly one
            # page (next title on the following page) -- confirm intended.
            if item['end_page'] <= item['page_number']:
                continue

        first_page, last_page = item['page_number'], item['end_page']
        item['tables'] = filter_tables(table_list, first_page, last_page)
        if image_list:
            item['images'] = filter_images(image_list, first_page, last_page)
        if content_list:
            item['content'] = filter_content(content_list, first_page, last_page)
        results.append(item)
    return results
if __name__ == '__main__':
    # Manual smoke run against one locally stored bid document; the paths
    # below point at a developer machine and are not portable.
    import json
    from pprint import pprint

    def _read_json(path):
        """Return the parsed JSON document stored at *path* (read as UTF-8)."""
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    title_list = _read_json('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-title_n.json')
    table_list = _read_json('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json')
    image_list = _read_json('D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-image.json')

    year = 2022
    # Audit-report titles for the two years preceding `year`.
    audit_report_title = '{}年审计报告'
    wanted_titles = [
        '财务状况',
        audit_report_title.format(year - 1),
        audit_report_title.format(year - 2),
    ]

    instances = get_instances_by_title(
        title_list=title_list,
        table_list=table_list,
        image_list=image_list,
        instances=wanted_titles,
    )
    pprint(instances)
|