@@ -0,0 +1,573 @@
+'''Tender document content extraction.'''
+import pandas as pd
+import numpy as np
+import pdfplumber
+import json
+import os
+import re
+import cv2
+from io import BytesIO
+from typing import Optional, List
+
+from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
+from pdfminer.high_level import extract_pages
+from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
+from pdfminer.pdftypes import (
+    LITERALS_DCT_DECODE,
+    LITERALS_JBIG2_DECODE,
+    LITERALS_JPX_DECODE,
+    LITERALS_FLATE_DECODE,
+)
+from pprint import pprint
+from pdfminer.pdfparser import PDFParser, PDFSyntaxError
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+import camelot
+
+from .tools import RefPageNumberResolver
+
+# Header cells commonly seen in tender tables; kept in Chinese because they must match the PDF text verbatim.
+HEADERS = {'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'}
+HEADERS |= {'条款号', '评审因素', '评审标准', ''}
+
+
+def is_title(line: str) -> bool:
+    """Heuristically decide whether a line of text is a chapter/section title."""
+    title_word = re.findall(r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
+    if title_word:
+        return True
+    title_word = re.findall(r'^附录|^参考文献|^附表', line.strip())
+    if title_word:
+        return True
+    return False
+
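+# A minimal, illustrative check of the title heuristic; the sample strings below are
+# made up for demonstration, not taken from a real tender document, and the function
+# is only defined here, never called on import.
+def _demo_is_title() -> None:
+    for sample in ['第一章 总则', '(一)投标人资格要求', '3.项目概况', '附录A', '这是一段普通正文。']:
+        print(sample, '->', is_title(sample))
+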
+PIL_ERROR_MESSAGE = "Could not import PIL: Pillow is required to export CMYK JPEG and JPEG 2000 images"
+def _save_jpeg(image: LTImage, path: str) -> str:
+    """Save a JPEG encoded image"""
+    raw_data = image.stream.get_rawdata()
+    assert raw_data is not None
+
+    path = path + ".jpg"
+
+    with open(path, "wb") as fp:
+        if LITERAL_DEVICE_CMYK in image.colorspace:
+            try:
+                from PIL import Image, ImageChops  # type: ignore[import]
+            except ImportError:
+                raise ImportError(PIL_ERROR_MESSAGE)
+
+            # CMYK JPEG data is inverted and converted to RGB so the saved file
+            # opens correctly in common viewers.
+            ifp = BytesIO(raw_data)
+            i = Image.open(ifp)
+            i = ImageChops.invert(i)
+            i = i.convert("RGB")
+            i.save(fp, "JPEG")
+        else:
+            fp.write(raw_data)
+
+    return path
+
+def _save_jpeg2000(image: LTImage, path: str) -> str:
+    """Save a JPEG 2000 encoded image"""
+    raw_data = image.stream.get_rawdata()
+    assert raw_data is not None
+
+    path = path + ".png"
+
+    try:
+        from PIL import Image  # type: ignore[import]
+    except ImportError:
+        raise ImportError(PIL_ERROR_MESSAGE)
+
+    # If we only write the raw data, most image viewers cannot open the file.
+    # Re-encoding it through OpenCV produces a file that other programs open without problems.
+    ifp = BytesIO(raw_data)
+    i = Image.open(ifp)
+    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
+    cv2.imwrite(path, opencv_image)
+    return path
+
+def export_image(image: LTImage, path: str) -> Optional[str]:
+    """Save an LTImage to disk and return the saved path, or None if the image has no data."""
+    filters = image.stream.get_filters()
+
+    # DCT (JPEG) and JPX (JPEG 2000) streams need dedicated handling.
+    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
+        return _save_jpeg(image, path)
+
+    if len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
+        return _save_jpeg2000(image, path)
+
+    # Prefer the decoded stream; fall back to the raw stream if decoding yields nothing.
+    data = image.stream.get_data() or image.stream.get_rawdata()
+    if not data:
+        return None
+
+    # Guess the file type from its magic bytes and pick a matching extension;
+    # anything unrecognised is written with an ".unk" suffix.
+    if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':
+        path += '.jpg'
+    elif data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
+        path += '.png'
+    elif data[:2] == b'\x42\x4d':
+        path += '.bmp'
+    elif data[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):
+        path += '.gif'
+    elif data[:2] in (b'\x4d\x4d', b'\x49\x49'):
+        path += '.tiff'
+    else:
+        path += '.unk'
+
+    with open(path, 'wb') as file:
+        file.write(data)
+    return path
+
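+# A minimal sketch of how export_image pairs with pdfminer's layout objects; the
+# 'sample.pdf' path and 'extracted_images' directory are illustrative assumptions.
+def _demo_export_image(pdf_path: str = 'sample.pdf', out_dir: str = 'extracted_images') -> None:
+    os.makedirs(out_dir, exist_ok=True)
+    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
+        for element in page_layout:
+            if isinstance(element, LTFigure):
+                for obj in element:
+                    if isinstance(obj, LTImage):
+                        saved = export_image(obj, os.path.join(out_dir, f'p{page_number}_{obj.name}'))
+                        print('saved:', saved)
+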
+def main_parse(pdf_path: str, title_path: str, image_dir: str) -> tuple:
+    texts = []
+    images = []
+    # Read the PDF and walk through its pages.
+    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
+        title_index = 0
+        image_index = 0
+        for element in page_layout:
+            if isinstance(element, LTLine):
+                pass
+            elif isinstance(element, LTRect):
+                pass
+            elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
+                text = element.get_text().strip()
+                # Assume a title is usually a single line with a larger font size.
+                if text and (is_title(text) or element.height > 15):
+                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
+                    title_index += 1
+            # elif isinstance(element, LTFigure):
+            #     for e_obj in element._objs:
+            #         if isinstance(e_obj, LTImage):
+            #             # Extract the image data.
+            #             image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
+            #             image_file = export_image(e_obj, image_file)
+            #             images.append(image_file)
+            #             pprint(f'Image saved: {image_file}')
+            #             image_index += 1
+
+    with open(title_path, 'w', encoding='utf-8') as fp:
+        json.dump(texts, fp, indent=4, ensure_ascii=False)
+    return title_path, image_dir
+
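+# A minimal usage sketch for main_parse; the paths below are illustrative assumptions.
+def _demo_main_parse() -> None:
+    image_dir = 'extracted_images'
+    os.makedirs(image_dir, exist_ok=True)
+    title_path, _ = main_parse(pdf_path='sample.pdf', title_path='sample_titles.json', image_dir=image_dir)
+    print('titles written to', title_path)
+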
+def parse_title(file_path: str, title_path: Optional[str] = None) -> Optional[str]:
+    """
+    Parse document titles (used to uniquely identify quotation entries).
+
+    Args:
+        file_path: path of the PDF to parse
+        title_path: where to save the extracted title list (JSON)
+
+    Returns:
+        title_path: the save path that was written to, or None if none was given
+    """
+    results = []
+
+    seq_num = 0
+
+    for page_number, page_layout in enumerate(extract_pages(file_path)):
+        title_index = 0
+        for element in page_layout:
+            if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
+                text = element.get_text().strip()
+                if text and (is_title(text) or element.height > 15):
+                    results.append({
+                        'index': title_index,
+                        'page_number': page_number,
+                        'bbox': element.bbox,
+                        'text': text,
+                        'title': text,
+                        'seq_num': seq_num
+                    })
+                    seq_num += 1
+                    title_index += 1
+
+    if title_path:
+        with open(title_path, 'w', encoding='utf-8') as fp:
+            json.dump(results, fp, indent=4, ensure_ascii=False)
+
+    return title_path
+
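+# A short sketch of parse_title; 'sample.pdf' and the output path are assumptions.
+def _demo_parse_title() -> None:
+    saved = parse_title('sample.pdf', title_path='sample_titles.json')
+    if saved:
+        with open(saved, encoding='utf-8') as fp:
+            for item in json.load(fp)[:5]:
+                print(item['page_number'], item['title'])
+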
+def parse_image(file_path: str, image_dir: str, image_meta_path: str) -> str:
+    """
+    Extract the images embedded in a PDF.
+
+    Args:
+        file_path: path of the PDF to parse
+        image_dir: directory the extracted images are written to
+        image_meta_path: where to save the image metadata list (JSON)
+
+    Returns:
+        image_meta_path: the metadata path that was written to
+    """
+    image_list = []
+
+    for page_number, page_layout in enumerate(extract_pages(file_path)):
+        image_index = 0
+        for element in page_layout:
+            if isinstance(element, LTFigure):
+                for e_obj in element._objs:
+                    if isinstance(e_obj, LTImage):
+                        # Extract and save the image data.
+                        image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
+                        image_file = export_image(e_obj, image_file)
+                        image_list.append({
+                            "image_index": image_index,
+                            "page_number": page_number,
+                            "image_name": image_file
+                        })
+                        image_index += 1
+
+    if image_meta_path:
+        with open(image_meta_path, 'w', encoding='utf-8') as fp:
+            json.dump(image_list, fp, indent=4, ensure_ascii=False)
+
+    return image_meta_path
+
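+# A short sketch of parse_image; all three paths are illustrative assumptions.
+def _demo_parse_image() -> None:
+    os.makedirs('extracted_images', exist_ok=True)
+    meta = parse_image('sample.pdf', image_dir='extracted_images', image_meta_path='sample_images.json')
+    print('image metadata written to', meta)
+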
+
+def table_parse(pdf_path: str,
+                title_path: str,
+                start_title: str = '第三章 评标办法(综合评估法)',
+                end_title: str = '第四章 合同条款及格式',
+                table_path: str = None,
+                start_page_number: int = None,
+                end_page_number: int = None
+                ) -> str:
+    """Parse the tables of a PDF between two chapter titles.
+    @pdf_path: PDF file to parse
+    @title_path: JSON title list produced by main_parse/parse_title
+    @start_title: title marking the first page to scan
+    @end_title: title marking the page to stop at
+    @table_path: where to save the extracted tables (JSON)
+    @start_page_number: explicit start page (overrides start_title)
+    @end_page_number: explicit end page (overrides end_title)
+    """
+    tables = []
+
+    if (start_page_number is None) or (end_page_number is None):
+        df = pd.read_json(title_path)
+        start_page_number = int(df[df['text'] == start_title].page_number.max())
+        end_page_number = int(df[df['text'] == end_title].page_number.max())
+
+    def concat_table(tables, table, page_number):
+        """Add a table to the result list: either append it as a new table or merge it
+        into the last table when it continues across pages.
+        @tables: accumulated table list
+        @table: the table extracted from the current page
+        @page_number: page the table was found on
+        """
+        first = [''.join(cell.split()) if cell else cell for cell in table[0]]
+        if len(HEADERS & set(first)) > 2:
+            # The first row contains many known header cells: independent header, start a new table.
+            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
+        elif tables and ((page_number - 1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
+            # Same column count as the table on the previous page: continuation, merge it.
+            tables[-1]['page_numbers'].append(page_number)
+            tables[-1]['table'].extend(table)
+        else:
+            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
+        return tables
+
+    with pdfplumber.open(pdf_path) as pdf:
+        print(start_page_number, end_page_number)
+        for i in range(start_page_number, end_page_number):
+            for table in pdf.pages[i].extract_tables():
+                tables = concat_table(tables, table, page_number=i)
+
+    with open(table_path, 'w', encoding='utf-8') as fp:
+        json.dump(tables, fp, indent=4, ensure_ascii=False)
+
+    return table_path
+
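+# A brief usage sketch for table_parse; the paths are illustrative assumptions, and the
+# default chapter titles are kept in Chinese because they must match the tender document verbatim.
+def _demo_table_parse() -> None:
+    saved = table_parse('sample.pdf', title_path='sample_titles.json', table_path='sample_tables.json')
+    with open(saved, encoding='utf-8') as fp:
+        print('extracted', len(json.load(fp)), 'tables')
+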
+
+class PdfExtractAttr_(object):
+    def __init__(self, file_path: str):
+        """PDF file parser.
+        @file_path: path of the PDF to parse
+        """
+        super(PdfExtractAttr_, self).__init__()
+        self.file_path = file_path
+        self.details = []
+        self.tables = []
+        self.content = []
+        self.chapters = []
+        self.references = []
+        self.detail_df = None
+        self.outlines = None
+
+    def parse_outline(self):
+        """Parse the PDF outline (bookmarks).
+        """
+        results = []
+        with open(self.file_path, "rb") as fp:
+            try:
+                parser = PDFParser(fp)
+                document = PDFDocument(parser)
+                ref_pagenum_resolver = RefPageNumberResolver(document)
+                outlines = document.get_outlines()
+                for (level, title, dest, a, se) in outlines:
+                    if dest:
+                        page_num = ref_pagenum_resolver.resolve(dest)
+                    elif a:
+                        page_num = ref_pagenum_resolver.resolve(a)
+                    elif se:
+                        page_num = ref_pagenum_resolver.resolve(se)
+                    else:
+                        page_num = None
+                    results.append({'level': level, 'title': title, 'page_number': page_num})
+            except PDFNoOutlines:
+                print("No outlines found.")
+            except PDFSyntaxError:
+                print("Corrupted PDF or non-PDF file.")
+            finally:
+                parser.close()
+
+        self.outlines = results
+        with open('outlines.json', 'w', encoding='utf-8') as op:
+            json.dump(results, op, indent=4, ensure_ascii=False)
+
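+    # A short sketch of parse_outline; 'sample.pdf' is an assumed path and the result is
+    # written to 'outlines.json' in the working directory, as the method does today.
+    @staticmethod
+    def _demo_parse_outline() -> None:
+        agent = PdfExtractAttr_(file_path='sample.pdf')
+        agent.parse_outline()
+        print(agent.outlines)
+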
+    def extract_content(self, content_path: str = None) -> str:
+        """Extract the plain text of every page with pdfplumber."""
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                self.content.append({
+                    'page_number': page.page_number - 1,
+                    'text': page.extract_text()
+                })
+
+        if content_path:
+            with open(content_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.content, fp, indent=4, ensure_ascii=False)
+
+        return content_path
+
+    def parse_text(self) -> None:
+        """Parse text boxes and record their position and alignment.
+        """
+        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
+            for element in page_layout:
+                if isinstance(element, LTTextBoxHorizontal):
+                    # Distance from the left edge of the page.
+                    left = element.x0
+                    # Distance from the right edge.
+                    right = (page_layout.width - element.x1)
+                    # Distance from the top edge.
+                    top = (page_layout.height - element.y1)
+                    # Distance from the bottom edge.
+                    bottom = element.y0
+                    # Width of the text box.
+                    width = element.width
+                    if (left > right) and (abs(left - right) > 100):
+                        alignment = 'right'
+                    elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
+                        alignment = 'center'
+                    else:
+                        alignment = 'left'
+                    self.details.append({
+                        'page_number': page_number,
+                        'index': element.index,
+                        'x0': element.bbox[0],
+                        'y0': element.bbox[1],
+                        'x1': element.bbox[2],
+                        'y1': element.bbox[3],
+                        'alignment': alignment,
+                        'lines': len(element._objs),
+                        'text': element.get_text().strip(),
+                        'is_table_name': element.get_text().strip().endswith('表')
+                    })
+        self.detail_df = pd.DataFrame(self.details)
+
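+    # A worked example of the alignment heuristic above with made-up margins (in PDF
+    # points): a box with left margin 120, right margin 130 and width 500 is "center".
+    @staticmethod
+    def _demo_alignment(left: float = 120.0, right: float = 130.0, width: float = 500.0) -> str:
+        if (left > right) and (abs(left - right) > 100):
+            return 'right'
+        elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
+            return 'center'
+        return 'left'
+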
+    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
+        """Add a table to the result list: either append it as a new table or merge it
+        into the last table when it continues across pages.
+        @table: the table extracted from the current page
+        @page_number: page the table was found on
+        @table_name: optional table title found near the table
+        @new: force the table to be appended as a new, independent table
+        """
+        first = [''.join(cell.split()) if cell else cell for cell in table[0]]
+
+        if new:
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+            return
+
+        if len(table) > 1:
+            second = [''.join(cell.split()) if cell else cell for cell in table[1]]
+        else:
+            second = None
+        if not self.tables or len(HEADERS & set(first)) > 2:
+            # The first row contains many known header cells: independent header, start a new table.
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+        elif second and (len(HEADERS & set(second)) > 2):
+            # The second row is the header, so the first row is probably the table title.
+            if not table_name:
+                first = [i for i in first if i]
+                if len(first) == 1:
+                    table_name = "".join(first)
+            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
+        elif ((page_number-1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
+            # Same column count as the table on the previous page: continuation, merge it.
+            self.tables[-1]['page_numbers'].append(page_number)
+            self.tables[-1]['table'].extend(table)
+        else:
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
+
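+    # An illustrative sketch of the merge logic with two synthetic page fragments: the
+    # first fragment carries a recognised header row, the second has the same column
+    # count and is merged into it. The dummy file path is never opened.
+    @staticmethod
+    def _demo_concat_table() -> None:
+        agent = PdfExtractAttr_(file_path='dummy.pdf')
+        agent.concat_table([['序号', '项目名称', '单位', '工程量'], ['1', '土方', 'm3', '100']], page_number=3)
+        agent.concat_table([['2', '回填', 'm3', '80']], page_number=4)
+        print(len(agent.tables), agent.tables[0]['page_numbers'])  # one merged table spanning pages 3 and 4
+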
+
+    def parse_table(self) -> None:
+        """Parse tables with pdfplumber only.
+        """
+        if self.detail_df is None:
+            self.parse_text()
+
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # Check whether the page contains any table.
+                tables = page_layout.find_tables()
+                # Exactly one table on the page: decide whether to merge it with the previous one.
+                if len(tables) == 1:
+                    table = tables[0]
+                    x0, y0, x1, y1 = table.bbox
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
+                    if table_title_df.empty:
+                        self.concat_table(table.extract(), page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']
+                        self.concat_table(table.extract(), page_number=page_number, table_name=table_title_name)
+                # Several tables on the page: only the first could continue a previous table;
+                # the rest are unrelated (not handled here).
+                elif len(tables) > 1:
+                    pass
+
+    def parse_table_pro(self, table_path: str = 'all_tables.json') -> str:
+        """Parse tables with pdfplumber (to locate them) and camelot (to extract them).
+        """
+        if self.detail_df is None:
+            self.parse_text()
+
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # Check whether the page contains any table.
+                tables = page_layout.find_tables()
+
+                if not tables:
+                    continue
+
+                tables_pro = camelot.read_pdf(
+                    self.file_path,
+                    # flavor='stream',
+                    pages=str(page_number+1),
+                    # edge_tol=200,
+                )
+
+                if not tables_pro:
+                    continue
+
+                print(len(tables), len(tables_pro))
+
+                # Exactly one table on the page: decide whether to merge it with the previous one.
+                if (len(tables) != 0) and (len(tables_pro) == 1):
+                    print(f"Parsing the table on page {page_number} of the PDF")
+                    table = tables[0]
+                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
+                    x0, y0, x1, y1 = table.bbox
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
+                    if table_title_df.empty:
+                        self.concat_table(table_pro, page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']
+                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
+                # Several tables on the page: only the first could continue a previous table;
+                # the following ones are treated as new, independent tables.
+                elif len(tables_pro) > 1:
+                    print(f"Parsing the tables on page {page_number} of the PDF")
+                    first_table = tables_pro[0]
+                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
+                    for table_index in range(1, len(tables_pro)):
+                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)
+
+        with open(table_path, 'w', encoding='utf-8') as fp:
+            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
+        return table_path
+
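+# An end-to-end sketch of the class-based pipeline; the PDF path and output paths are
+# illustrative assumptions, not files that ship with this module.
+def _demo_pdf_extract_attr() -> None:
+    agent = PdfExtractAttr_(file_path='sample.pdf')
+    agent.extract_content(content_path='sample_content.json')
+    agent.parse_text()
+    saved = agent.parse_table_pro(table_path='sample_tables.json')
+    print('tables written to', saved)
+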
+
+if __name__ == '__main__':
+    # pdf_path = 'data/预审查数据/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.pdf'
+    # image_dir = 'data/预审查数据/extracted_images'
+    # title_path = 'data/预审查数据/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.json'
+
+    # pdf_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.pdf'
+    # image_dir = 'data/预审查数据/extracted_images'
+    # title_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
+
+    # os.makedirs(image_dir, exist_ok=True)
+    # main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
+
+    # table_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
+    # content_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
+    # agent = PdfExtractAttr_(file_path=pdf_path)
+
+    ## agent.extract_content(content_path=content_path)
+    # contents = agent.output_()
+
+    # agent.parse_text()
+    # agent.parse_table()
+    ## agent.parse_table_pro(table_path=table_path)
+    # all_tables = agent.output()
+
+    import glob
+    dir_path = 'data/财报素材'
+    for pdf_path in glob.glob(f'{dir_path}/*.pdf'):
+        print(pdf_path)
+        if '600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告' not in pdf_path: continue
+        agent = PdfExtractAttr_(file_path=pdf_path)
+
+        content_path = f'{dir_path}/{pdf_path.split("/")[-1].split(".")[0]}_content.json'
+        agent.extract_content(content_path=content_path)
+
+        table_path = f'{dir_path}/{pdf_path.split("/")[-1].split(".")[0]}_table.json'
+        agent.parse_table_pro(table_path=table_path)
+