# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-09-03 16:03:06
from typing import List, Optional

from . import celery_app
def batch_bool(instances: List[str], text: str) -> bool:
    """
    Report whether any of the given keywords occurs in the text.

    Args:
        instances: Keywords (field names) to search for.
        text: The text to search in.

    Returns:
        True if at least one keyword is a substring of ``text``,
        False otherwise (including when ``instances`` is empty).
    """
    # any() short-circuits on the first hit, like the explicit loop it replaces.
    return any(keyword in text for keyword in instances)
@celery_app.task
def extract_project(table_list: list, instances: Optional[List[str]] = None) -> list:
    """
    Extract project-performance tables from a list of parsed PDF tables.

    A table is kept when the first cell of at least one of its rows contains
    one of the keywords in ``instances``.

    Args:
        table_list: Parsed tables. Each item is indexed with the keys
            ``'table'`` (rows of cells) and ``'page_numbers'``.
        instances: Keywords to look for in the first column. When omitted or
            None, the default contract-related field names are used.

    Returns:
        A list with one ``{"page_numbers": ..., "table": ...}`` dict per
        matching table, where ``table`` holds the cleaned cell text.
    """
    if instances is None:
        # Built per call instead of as a default argument, so the list object
        # is never shared between invocations.
        instances = ['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']

    res = []
    for table in table_list:
        # Remove newlines and spaces from every cell before matching.
        # NOTE(review): None cells are treated as empty text instead of
        # crashing on .replace; confirm the upstream extractor can emit None.
        tab = [
            [('' if cell is None else cell).replace('\n', '').replace(' ', '') for cell in row]
            for row in table['table']
        ]
        pages = table['page_numbers']
        # Only the first column is inspected; empty rows have no first cell
        # and are skipped rather than raising IndexError.
        first_cells = [row[0] for row in tab if row]
        if any(batch_bool(instances, cell) for cell in first_cells):
            res.append({
                "page_numbers": pages,
                "table": tab
            })
    return res
if __name__ == '__main__':
    import json
    from pprint import pprint

    # Manual smoke run: load one table dump from disk and print the tables
    # that extract_project selects for the listed field names.
    sample_file = r'D:\\desktop\\三峡水利\\data\\projects\\三峡左岸及地下电站地坪整治\\投标\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json'
    wanted_fields = ['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']

    with open(sample_file, 'r', encoding='utf-8') as handle:
        parsed_tables = json.load(handle)

    matched = extract_project(table_list=parsed_tables, instances=wanted_fields)
    pprint(matched)
|