1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-06-11 13:43:14
- # @Last Modified by: privacy
- # @Last Modified time: 2024-09-03 16:03:06
- # description: 总测试次数:82 成功:61 失败:21 成功率:0.7439024390243902
from typing import List, Optional
def batch_bool(instances: List[str], text: str) -> bool:
    """
    Report whether any of the given keywords occurs in the text.

    Args:
        instances: keywords to look for
        text: string that is searched with a substring test

    Returns:
        True as soon as one keyword is found inside ``text``,
        False when none is found (including when ``instances`` is empty)
    """
    return any(keyword in text for keyword in instances)
# Field names looked for when the caller does not supply ``instances``.
# Kept as a tuple so the default can never be mutated between calls.
_DEFAULT_PROJECT_FIELDS = ('合同金额', '合同价格', '发包人名称', '项目规模', '合同时间')


def _clean_cell(cell):
    """Strip newlines and spaces from a string cell; return other values unchanged."""
    if isinstance(cell, str):
        return cell.replace('\n', '').replace(' ', '')
    return cell


def _first_column_matches(rows: list, keywords) -> bool:
    """Return True when the first cell of any non-empty row contains a keyword."""
    for row in rows:
        if not row:
            # An empty row has no first cell to inspect.
            continue
        head = row[0]
        if isinstance(head, str) and any(keyword in head for keyword in keywords):
            return True
    return False


def extract_project(table_list: list, instances: Optional[List[str]] = None) -> list:
    """
    Select the tables that look like project-performance tables.

    A table is selected when the first cell of at least one of its rows
    contains one of the keywords in ``instances``. Cell text is compared
    after newlines and spaces have been removed.

    Args:
        table_list: list of dicts, each read through the keys ``'table'``
            (a list of rows, each row a list of cells) and ``'page_numbers'``
        instances: keywords to search for in the first column; when None,
            the built-in default field names are used

    Returns:
        A list with one ``{"page_numbers": ..., "table": ...}`` dict per
        selected table, in input order. ``table`` holds the cleaned cells;
        non-string cells (e.g. None) are passed through unchanged.

    Raises:
        KeyError: if a table dict lacks ``'table'`` or ``'page_numbers'``.
    """
    # ``is None`` (not ``or``) so an explicitly empty list still means "match nothing".
    keywords = _DEFAULT_PROJECT_FIELDS if instances is None else instances

    res = []
    for table in table_list:
        tab = [[_clean_cell(cell) for cell in row] for row in table['table']]
        pages = table['page_numbers']
        if _first_column_matches(tab, keywords):
            res.append({
                "page_numbers": pages,
                "table": tab
            })
    return res
if __name__ == '__main__':
    # Manual smoke test: load one extracted-table JSON file from a local
    # path and pretty-print the tables that extract_project selects.
    from pprint import pprint
    import json

    sample_file = r'D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\三峡左岸及地下电站地坪整治\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json'
    target_fields = ['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']

    with open(sample_file, 'r', encoding='utf-8') as handle:
        parsed_tables = json.load(handle)

    selected = extract_project(table_list=parsed_tables, instances=target_fields)
    pprint(selected)
|