# xzc / pdf_title_image
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-09-03 16:03:06
# description: 总测试次数：82 成功：61 失败：21 成功率：0.7439024390243902
from typing import List, Optional


def batch_bool(instances: List[str], text: str) -> bool:
    """
    Report whether any of the given keywords occurs inside a text.

    Args:
        instances:  Keywords to look for.
        text:       Text that is searched with a substring test.

    Returns:
        True as soon as one keyword is found in ``text``, otherwise False
        (including when ``instances`` is empty).
    """
    return any(keyword in text for keyword in instances)


# Keywords used when the caller does not supply ``instances``. Kept as an
# immutable tuple so the default can never be altered between calls.
_DEFAULT_INSTANCES = ('合同金额', '合同价格', '发包人名称', '项目规模', '合同时间')


def extract_project(table_list: list, instances: Optional[List[str]] = None) -> list:
    """
    Extract project-performance tables from a list of parsed PDF tables.

    Every cell is normalized by removing newlines and spaces. A table is kept
    when the first cell of at least one of its rows contains any keyword from
    ``instances``; the normalized table is what gets returned.

    Args:
        table_list: Tables to scan. Each item must provide a 'table' entry
                    (rows of cells) and a 'page_numbers' entry.
        instances:  Keywords to search for in the first column. When omitted
                    or None, the built-in contract-related keywords are used.

    Returns:
        A list of dicts, one per matching table, each holding the original
        'page_numbers' value and the normalized 'table'.

    Raises:
        KeyError: If a table item lacks the 'table' or 'page_numbers' entry.
    """
    if instances is None:
        instances = _DEFAULT_INSTANCES

    res = []

    for table in table_list:
        # Cells may be None (some PDF table extractors emit None for empty
        # cells); treat those as empty text instead of failing on .replace().
        tab = [
            ['' if cell is None else cell.replace('\n', '').replace(' ', '') for cell in row]
            for row in table['table']
        ]
        pages = table['page_numbers']

        # Only the first column is searched. Rows without any cell are
        # skipped because they have no first column to inspect.
        matched = any(
            keyword in row[0]
            for row in tab
            if row
            for keyword in instances
        )
        if matched:
            res.append({
                "page_numbers": pages,
                "table": tab
            })

    return res


if __name__ == '__main__':
    # Manual smoke run: load a table dump from disk and print the tables
    # that extract_project selects for the listed keywords.
    from pprint import pprint
    import json

    table_path = r'D:\\desktop\\三峡水利\\data\\0预审查初审详审测试数据\\三峡左岸及地下电站地坪整治\\湖北建新建设工程有限公司_T221100130348%2F01整本文件\\投标文件-修改版9-5-1-1-table.json'
    keywords = ['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间']

    with open(table_path, 'r', encoding='utf-8') as fp:
        tables = json.load(fp)

    projects = extract_project(table_list=tables, instances=keywords)
    pprint(projects)