# xzc/pdf_title_image
							import time
import json
from typing import (
    List,
    Optional
)

from get_info import PdfExtractAttr


# Keywords used to recognise project-performance tables: contract amount,
# contract price, and name of the contract-issuing party. The same values are
# passed to extract_project() in the __main__ demo at the bottom of this file.
ins = [
    '合同金额',
    '合同价格',
    '发包人名称',
]


def batch_bool(instances: List[str], text: str) -> bool:
    """Return True if at least one entry of ``instances`` is a substring of ``text``.

    Entries are tested in order and the scan stops at the first hit. An empty
    ``instances`` list yields False.
    """
    return any(keyword in text for keyword in instances)


def extract_project(instances: List[str], table_dict: Optional[List[dict]] = None, table_path: Optional[str] = None, pdf_path: Optional[str] = None) -> list:
    """
    Select project-performance tables from a collection of parsed tables.

    Every cell of every table is first cleaned by removing newlines and
    spaces. A table is selected when the first cell of at least one of its
    rows contains any keyword from ``instances`` (substring match).

    The tables are taken from the first usable source, in this order:
    ``table_dict``, ``table_path``, ``pdf_path``.

    Args:
        instances:  keywords to look for in the first column
        table_dict: already-loaded tables; a sequence of dicts, each with a
                    ``'table'`` entry (rows of cells) and a ``'page_numbers'``
                    entry
        table_path: path to a UTF-8 JSON file holding the same structure
        pdf_path:   path to the source PDF; tables are obtained through
                    ``PdfExtractAttr(file_path=pdf_path).parse_table_pro()``

    Returns:
        A list with one ``{"page_numbers": ..., "table": ...}`` dict per
        selected table, in input order. ``"table"`` holds the cleaned rows.

    Raises:
        ValueError: if none of the three sources was supplied.
    """
    if table_dict:
        tables = table_dict
    elif table_path:
        with open(table_path, 'r', encoding='utf-8') as jsonfile:
            tables = json.load(jsonfile)
    elif pdf_path:
        agent = PdfExtractAttr(file_path=pdf_path)
        tables = agent.parse_table_pro()
    elif table_dict is not None:
        # An explicitly supplied but empty collection is valid input: there is
        # simply nothing to scan, so the result is an empty list, not an error.
        tables = table_dict
    else:
        raise ValueError("请输入需要解析的文件！")

    res = []

    for table in tables:
        # Cells may be None (empty cells in extractor output); treat them as
        # empty text instead of failing on None.replace(...).
        cleaned = [
            ['' if cell is None else cell.replace('\n', '').replace(' ', '') for cell in row]
            for row in table['table']
        ]
        pages = table['page_numbers']

        # Only the first column is inspected; rows with no cells are skipped
        # so that row[0] cannot raise IndexError.
        first_cells = [row[0] for row in cleaned if row]

        if any(keyword in cell for cell in first_cells for keyword in instances):
            res.append({
                "page_numbers": pages,
                "table": cleaned
            })

    return res


if __name__ == '__main__':
    # Manual smoke run against a sample document; the paths below are relative
    # to the working directory and must exist locally.
    from pprint import pprint

    # Source PDF of the sample (kept for reference; not read in this run).
    file = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'

    # Table JSON stored next to the PDF (presumably extracted from it beforehand).
    table_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司-table.json'

    with open(table_path, 'r', encoding='utf-8') as jsonfile:
        tables = json.load(jsonfile)

    # Pick out the tables whose first column mentions any of the keywords,
    # then pretty-print them.
    matched = extract_project(
        instances=['合同金额', '合同价格', '发包人名称'],
        table_dict=tables
    )
    pprint(matched)