# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-09-03 16:03:06
# description: total tests: 82, passed: 61, failed: 21, success rate: 0.7439024390243902
from typing import List, Optional

# Keywords searched for when ``extract_project`` is called without ``instances``.
# Kept as an immutable tuple so the default can never be mutated between calls.
_DEFAULT_INSTANCES = ('合同金额', '合同价格', '发包人名称', '项目规模', '合同时间')


def batch_bool(instances: List[str], text: str) -> bool:
    """
    Check whether any of the given keywords occurs in a text.

    Args:
        instances: keywords to look for
        text: text to search in

    Returns:
        True if at least one keyword is a substring of ``text``, else False
        (always False when ``instances`` is empty)
    """
    return any(keyword in text for keyword in instances)


def _clean_cell(cell: Optional[str]) -> str:
    """Strip newlines and spaces from a cell; a missing (None) cell becomes ''."""
    if cell is None:
        return ''
    return cell.replace('\n', '').replace(' ', '')


def extract_project(table_list: list, instances: Optional[List[str]] = None) -> list:
    """
    Select the tables that look like project-record tables.

    A table is selected when the first cell of any of its rows contains one
    of the keywords in ``instances``.

    Args:
        table_list: list of dicts, each with a 'table' key (rows of cells) and
            a 'page_numbers' key
        instances: keywords to look for in the first column; when omitted,
            the built-in default keyword set is used

    Returns:
        A list of ``{"page_numbers": ..., "table": ...}`` dicts, one per
        selected table, in input order. ``table`` holds the cleaned cells
        (newlines and spaces removed, None cells replaced by '').
    """
    keywords = list(_DEFAULT_INSTANCES) if instances is None else instances

    res = []
    for table in table_list:
        cleaned = [[_clean_cell(cell) for cell in row] for row in table['table']]
        # Empty rows have no first cell to inspect, so they are skipped here
        # instead of raising IndexError; they are still kept in the output.
        if any(row and batch_bool(keywords, row[0]) for row in cleaned):
            res.append({
                "page_numbers": table['page_numbers'],
                "table": cleaned
            })
    return res


if __name__ == '__main__':
    import json
    from pprint import pprint

    # Developer-local sample file used for manual testing.
    table_path = r'D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\三峡左岸及地下电站地坪整治\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json'
    with open(table_path, 'r', encoding='utf-8') as jsonfile:
        tables = json.load(jsonfile)
    pprint(
        extract_project(
            table_list=tables,
            instances=['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']
        )
    )