123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- import time
- import json
- from typing import (
- List,
- Optional
- )
- from get_info import PdfExtractAttr
# Keywords that identify a project-performance table. The same three values
# are passed to extract_project() in the __main__ demo; nothing in the
# visible code references this name directly.
ins = [
    '合同金额',
    '合同价格',
    '发包人名称',
]
def batch_bool(instances: List[str], text: str) -> bool:
    """Return True if at least one entry of ``instances`` is a substring of ``text``.

    Args:
        instances: Candidate substrings to look for.
        text: The string that is searched.

    Returns:
        True as soon as one candidate is found, False when none match
        (including when ``instances`` is empty).
    """
    return any(candidate in text for candidate in instances)
def extract_project(instances: List[str], table_dict: Optional[dict] = None, table_path: Optional[str] = None, pdf_path: Optional[str] = None) -> list:
    """
    Extract project-performance tables from a set of parsed tables.

    A table is kept when the first cell of any of its rows contains one of
    the ``instances`` keywords (substring match, performed after newlines
    and spaces have been removed from every string cell).

    Exactly one source is used, chosen in this order: ``table_dict``,
    ``table_path``, ``pdf_path``. Sources are tested by truthiness, so an
    empty ``table_dict`` is treated the same as "not supplied".

    Args:
        instances: Keywords to look for in the first column of each table.
        table_dict: Already-loaded tables. Despite the annotation it is
            iterated as a sequence of mappings, each providing a ``'table'``
            entry (list of rows, each a list of cells) and a
            ``'page_numbers'`` entry.
        table_path: Path to a UTF-8 JSON file holding the same structure.
        pdf_path: Path to the source PDF; tables are obtained from
            ``PdfExtractAttr(file_path=pdf_path).parse_table_pro()``.

    Returns:
        A list with one ``{"page_numbers": ..., "table": ...}`` dict per
        matching table, where ``table`` holds the cleaned cells.

    Raises:
        ValueError: If none of the three sources is supplied.
    """
    if table_dict:
        tables = table_dict
    elif table_path:
        with open(table_path, 'r', encoding='utf-8') as jsonfile:
            tables = json.load(jsonfile)
    elif pdf_path:
        agent = PdfExtractAttr(file_path=pdf_path)
        tables = agent.parse_table_pro()
    else:
        raise ValueError("请输入需要解析的文件!")

    def _clean(cell):
        # Only strings can be stripped; non-string cells (for example None)
        # are passed through unchanged instead of raising AttributeError.
        if isinstance(cell, str):
            return cell.replace('\n', '').replace(' ', '')
        return cell

    res = []
    for table in tables:
        tab = [[_clean(cell) for cell in row] for row in table['table']]
        pages = table['page_numbers']
        # Empty rows have no first cell, so they are skipped here rather
        # than triggering an IndexError; they are still kept in ``tab``.
        first_cells = [row[0] for row in tab if row]
        matched = any(
            isinstance(cell, str) and any(key in cell for key in instances)
            for cell in first_cells
        )
        if matched:
            # One entry per table, no matter how many rows match.
            res.append({
                "page_numbers": pages,
                "table": tab
            })
    return res
if __name__ == '__main__':
    # Demo run: load a pre-extracted table JSON and print the tables whose
    # first column mentions one of the contract-related keywords.
    from pprint import pprint

    # Path of the source PDF; assigned but not used by this demo.
    file = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
    # Path of the JSON file that holds the tables for that PDF.
    table_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司-table.json'

    with open(table_path, 'r', encoding='utf-8') as jsonfile:
        tables = json.load(jsonfile)

    matched_tables = extract_project(
        instances=['合同金额', '合同价格', '发包人名称'],
        table_dict=tables,
    )
    pprint(matched_tables)
|