# xzc / pdf_title_image
from typing import List
from get_info import PdfExtractAttr
# from scan_dir import scan_dir
import time


# Keyword list (contract amount, contract price, employer name). In the
# visible code it is only referenced by the commented-out batch driver, which
# passes it as the ``instances`` argument of extract_project.
ins = ['合同金额', '合同价格', '发包人名称']


def batch_bool(instances: List[str], text: str) -> bool:
    """Report whether ``text`` contains at least one of ``instances``.

    Args:
        instances: Candidate substrings to look for.
        text: String that is searched.

    Returns:
        True as soon as any candidate occurs in ``text``; False when none
        does, including when ``instances`` is empty.
    """
    return any(keyword in text for keyword in instances)


def extract_project(path: str, instances: List[str]):
    """Collect the tables of a PDF whose first column mentions a keyword.

    Every table returned by ``PdfExtractAttr(file_path=path).parse_table_pro()``
    is cleaned (newlines and spaces removed from each cell) and kept when the
    first cell of at least one of its rows contains any string in
    ``instances``.

    Args:
        path: Location of the PDF, forwarded to ``PdfExtractAttr``.
        instances: Substrings to look for in the first cell of each row.

    Returns:
        A list of ``{"page_numbers": ..., "table": ...}`` dicts, one per
        matching table, in the order the tables were returned. ``table`` is
        the cleaned copy and ``page_numbers`` is passed through unchanged.
    """
    agent = PdfExtractAttr(file_path=path)
    tables = agent.parse_table_pro()
    res = []
    for table in tables:
        # Normalise cell text. A None cell is treated as empty text instead of
        # raising AttributeError on .replace().
        # NOTE(review): assumes empty cells may arrive as None - confirm
        # against parse_table_pro.
        tab = [
            [
                ('' if cell is None else cell).replace('\n', '').replace(' ', '')
                for cell in row
            ]
            for row in table['table']
        ]
        pages = table['page_numbers']

        # Rows without any cell are skipped: row[0] would raise IndexError.
        first_cells = (row[0] for row in tab if row)

        # One matching row is enough; each table is recorded at most once.
        if any(batch_bool(instances, cell) for cell in first_cells):
            res.append({
                "page_numbers": pages,
                "table": tab
            })

    return res


if __name__ == '__main__':
    # Disabled batch driver: ran extract_project over every PDF found by
    # scan_dir and printed the elapsed time after each file.
    # fs = scan_dir('/home/zzh/ocr/pdf', 'pdf')
    # start = time.time()
    # for f in (fs[:]):
    #     try:
    #         print(f)
    #         print(extract_project(f, ins))
    #         print('\n*********Runtime {} s *********\n'.format(time.time() - start))
    #     except BaseException as e:
    #         print('Something wrong')
    #         print(e)

    # Ad-hoc run against a single sample PDF.
    sample_pdf = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
    keywords = ['合同金额', '合同价格', '发包人名称']
    matched_tables = extract_project(sample_pdf, keywords)
    print(matched_tables)