# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-26 09:48:46
from typing import List, Optional

from celery_tasks.tools import filter_tables, filter_images, filter_content
from celery_tasks.text_extractor import similarity_filter, similar_match


def get_instances_by_title(title_list: List[dict],
                           table_list: List[dict],
                           image_list: List[dict],
                           content_list: Optional[List[dict]] = None,
                           instances: Optional[List[str]] = None) -> List[dict]:
    """
    Collect the tables, images and content that fall under the given titles.

    Titles in ``title_list`` are matched against ``instances`` with
    ``similar_match`` and kept when ``similarity_filter`` accepts them at the
    0.5 threshold. For every kept title the page span runs from the title's
    own ``page_number`` to the page before the next entry in ``title_list``
    (or to infinity when the title is the last entry).

    Notes carried over from the original author (not verifiable from this
    file alone): major headings come from the PDF outlines, minor headings
    from text; tables/figures appear as (1) text + table, first line used as
    the title, (2) text + image, first line used as the title, or (3) a bare
    image/table that is merged upward.

    Args:
        title_list: PDF title entries. Each matched entry is indexed by its
            ``seq_num`` and must carry ``page_number``.
        table_list: PDF table entries, passed to ``filter_tables``.
        image_list: PDF image entries, passed to ``filter_images``; skipped
            when empty or None.
        content_list: PDF content entries, passed to ``filter_content``;
            skipped when empty or None.
        instances: Titles to look for. Defaults to ``['近年财务状况表']``.

    Returns:
        The matched title dicts, each extended in place with ``end_page`` and
        ``tables``, plus ``images`` / ``content`` when the corresponding list
        was supplied. Other keys (title, index, start page, similarity) are
        whatever ``similar_match`` produced.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if instances is None:
        instances = ['近年财务状况表']

    title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)

    results = []
    for item in title_sims:
        # seq_num is used as the title's index in title_list, so the next
        # entry marks where this section stops.
        try:
            next_title = title_list[item['seq_num'] + 1]
        except IndexError:
            # Last title in the document: the section is open-ended.
            item['end_page'] = float('inf')
        else:
            item['end_page'] = next_title['page_number'] - 1
            # Skip sections that do not extend past their starting page.
            # NOTE(review): this also drops a section whose next title starts
            # on the very next page -- confirm that `<=` (not `<`) is intended.
            if item['end_page'] <= item['page_number']:
                continue

        item['tables'] = filter_tables(table_list, item['page_number'], item['end_page'])
        if image_list:
            item['images'] = filter_images(image_list, item['page_number'], item['end_page'])
        if content_list:
            item['content'] = filter_content(content_list, item['page_number'], item['end_page'])
        results.append(item)

    return results


if __name__ == '__main__':
    import json
    from pprint import pprint

    # Manual smoke test against one locally extracted bid document; the three
    # JSON files share this prefix and differ only in their suffix.
    sample_prefix = (
        'D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\'
        '湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1'
    )

    def _load_json(suffix):
        """Load the sample JSON file whose name ends with ``suffix``."""
        with open(sample_prefix + suffix, 'r', encoding='utf-8') as fp:
            return json.load(fp)

    title_list = _load_json('-title_n.json')
    table_list = _load_json('-table.json')
    image_list = _load_json('-image.json')

    year = 2022
    instances = get_instances_by_title(
        title_list=title_list,
        table_list=table_list,
        image_list=image_list,
        instances=['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)]
    )
    pprint(instances)