Add CI/CD example

sprivacy 7 months ago
commit 41baeabf79

+ 136 - 136
api.py

@@ -1,136 +1,136 @@
-# -*- coding: utf-8 -*-
-# @Author: privacy
-# @Date:   2024-09-03 11:24:56
-# @Last Modified by:   privacy
-# @Last Modified time: 2024-09-30 13:41:28
-import os
-import time
-
-import zipfile
-import requests
-from pydantic import BaseModel
-from celery.result import AsyncResult
-from fastapi import FastAPI, UploadFile, File, Body
-
-from celery_tasks import celery_app
-from celery_tasks.commonprocess import bidding_factor, test_all_files
-from celery_tasks.project_loc import extract_project
-from celery_tasks.commonprocess import full_func
-
-
-app = FastAPI()
-
-
-@app.get('/result')
-def back(taskid):
-    result = AsyncResult(id=taskid, app=celery_app)
-    if result.successful():
-        val = result.get()
-        return "执行完成,结果:%s" % val
-    else:
-        return '正在处理中...'
-
-
-# 检测编码(已完成)
-def decode_path(path):
-    '''zipfile解压出现乱码,将乱码的路径编码为UTF8'''
-    try:
-        path_name = path.decode('utf-8')
-    except Exception:
-        path_name = path.encode('437').decode('gbk')
-        path_name = path_name.encode('utf-8').decode('utf-8')
-    return path_name
-
-
-# 下载招标、投标、专家打分表图片文件
-def get_file(path_name="1@/static/bm/bid_check_bidder/20240808151412_投标文件25mb.zip", save_file='./download'):
-    url = 'http://192.168.1.111:9999/admin/sys-file/download'  # 要下载的文件的URL
-    url = 'http://192.168.1.111:9999/admin/sys-file/preview'
-    json = {"path": path_name}
-
-    os.makedirs(save_file, exist_ok=True)
-    code = 0
-    try:
-        response = requests.get(url, data=json, stream=True)  # 发送GET请求,stream参数指定以流的方式下载文件
-    except Exception:
-        print('下载失败,状态码:', response.status_code)
-        code = 1
-    file_name = path_name.split('/')[-1]
-
-    save_file_path = save_file + '/new_' + file_name
-
-    if response.status_code == 200:  # 检查响应状态码
-        with open(save_file_path, "wb") as fp:
-            fp.write(response.content)
-            print('文件下载完成!')
-    if code != 0:  # 下载失败抓取
-        return save_file_path, code
-    if os.path.isfile(save_file_path) and save_file_path.endswith('.zip'):
-        # 解压方式2:防止乱码
-        tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
-
-        os.makedirs('./cache/' + tempdir, exist_ok=True)
-        file_path_list = []
-        try:
-            with zipfile.ZipFile(save_file_path, allowZip64=True) as zf:
-                # 排除目录文件
-                file_iter = (filename for filename in zf.filelist if os.path.isfile(save_file_path))
-                for filename in file_iter:
-                    # 编码文件名称为 utf 格式
-                    filename.filename = decode_path(filename.filename)  # 防止乱码的操作
-                    zf.extract(filename, "./cache/" + tempdir)
-                    if filename.filename[-1] == '/':
-                        continue
-                    file_path_list.append("./cache/" + tempdir + '/' + filename.filename)
-        except Exception as e:
-            print(e)
-
-        return file_path_list, code
-
-    return save_file_path, code
-
-
-# 预审查、清标
-@app.post("/file_upload")
-async def file_upload(text_list=Body(None)):
-    """
-    {
-        'bidderUnit': '杭州华新机电工程有限公司',
-        'bidderFile': '1@/static/bm/bid_pre_check/20240924172133_三峡投标文件新-华新(1).zip',
-        'buyFile': '1@/static/bm/project/20240924171822_三峡电站左岸厂房桥机远程智能化操作研发与实施重新招标.pdf',
-        'reportFlag': '0',
-        'projectName': '华新',
-        'projectId': '2024-9-24-0'
-    }
-    """
-    try:
-        json_post = eval(text_list)
-    except Exception:
-        json_post = text_list
-
-    buyFile = json_post['buyFile']  # 采购文件
-
-    bidderFile = json_post['bidderFile']  # 投标文件
-
-    try:
-        buy_file_path, code = get_file(buyFile)
-
-        bidder_file_path, code = get_file(bidderFile)
-
-        json_data = {
-            "code": 0,
-            "name": '',
-            "msg": "文件下载成功",
-            "data": {},
-            "ok": True
-        }
-    except Exception:
-        json_data = {
-            "code": 1,
-            "name": '',
-            "msg": "文件下载失败",
-            "data": {},
-            "ok": False
-        }
-
-    return json_data
+# # -*- coding: utf-8 -*-
+# # @Author: privacy
+# # @Date:   2024-09-03 11:24:56
+# # @Last Modified by:   privacy
+# # @Last Modified time: 2024-12-23 14:38:28
+# import os
+# import time
+
+# import zipfile
+# import requests
+# from pydantic import BaseModel
+# from celery.result import AsyncResult
+# from fastapi import FastAPI, UploadFile, File, Body
+
+# from celery_tasks import celery_app
+# from celery_tasks.commonprocess import bidding_factor, test_all_files
+# from celery_tasks.project_loc import extract_project
+# from celery_tasks.commonprocess import full_func
+
+
+# app = FastAPI()
+
+
+# @app.get('/result')
+# def back(taskid):
+#     result = AsyncResult(id=taskid, app=celery_app)
+#     if result.successful():
+#         val = result.get()
+#         return "执行完成,结果:%s" % val
+#     else:
+#         return '正在处理中...'
+
+
+# # 检测编码(已完成)
+# def decode_path(path):
+#     '''zipfile解压出现乱码,将乱码的路径编码为UTF8'''
+#     try:
+#         path_name = path.decode('utf-8')
+#     except Exception:
+#         path_name = path.encode('437').decode('gbk')
+#         path_name = path_name.encode('utf-8').decode('utf-8')
+#     return path_name
+
+
+# # 下载招标、投标、专家打分表图片文件
+# def get_file(path_name="1@/static/bm/bid_check_bidder/20240808151412_投标文件25mb.zip", save_file='./download'):
+#     url = 'http://192.168.1.111:9999/admin/sys-file/download'  # 要下载的文件的URL
+#     url = 'http://192.168.1.111:9999/admin/sys-file/preview'
+#     json = {"path": path_name}
+
+#     os.makedirs(save_file, exist_ok=True)
+#     code = 0
+#     try:
+#         response = requests.get(url, data=json, stream=True)  # 发送GET请求,stream参数指定以流的方式下载文件
+#     except Exception:
+#         print('下载失败,状态码:', response.status_code)
+#         code = 1
+#     file_name = path_name.split('/')[-1]
+
+#     save_file_path = save_file + '/new_' + file_name
+
+#     if response.status_code == 200:  # 检查响应状态码
+#         with open(save_file_path, "wb") as fp:
+#             fp.write(response.content)
+#             print('文件下载完成!')
+#     if code != 0:  # 下载失败抓取
+#         return save_file_path, code
+#     if os.path.isfile(save_file_path) and save_file_path.endswith('.zip'):
+#         # 解压方式2:防止乱码
+#         tempdir = time.strftime("%Y_%m_%dT%H_%M_%S")
+
+#         os.makedirs('./cache/' + tempdir, exist_ok=True)
+#         file_path_list = []
+#         try:
+#             with zipfile.ZipFile(save_file_path, allowZip64=True) as zf:
+#                 # 排除目录文件
+#                 file_iter = (filename for filename in zf.filelist if os.path.isfile(save_file_path))
+#                 for filename in file_iter:
+#                     # 编码文件名称为 utf 格式
+#                     filename.filename = decode_path(filename.filename)  # 防止乱码的操作
+#                     zf.extract(filename, "./cache/" + tempdir)
+#                     if filename.filename[-1] == '/':
+#                         continue
+#                     file_path_list.append("./cache/" + tempdir + '/' + filename.filename)
+#         except Exception as e:
+#             print(e)
+
+#         return file_path_list, code
+
+#     return save_file_path, code
+
+
+# # 预审查、清标
+# @app.post("/file_upload")
+# async def file_upload(text_list=Body(None)):
+#     """
+#     {
+#         'bidderUnit': '杭州华新机电工程有限公司',
+#         'bidderFile': '1@/static/bm/bid_pre_check/20240924172133_三峡投标文件新-华新(1).zip',
+#         'buyFile': '1@/static/bm/project/20240924171822_三峡电站左岸厂房桥机远程智能化操作研发与实施重新招标.pdf',
+#         'reportFlag': '0',
+#         'projectName': '华新',
+#         'projectId': '2024-9-24-0'
+#     }
+#     """
+#     try:
+#         json_post = eval(text_list)
+#     except Exception:
+#         json_post = text_list
+
+#     buyFile = json_post['buyFile']  # 采购文件
+
+#     bidderFile = json_post['bidderFile']  # 投标文件
+
+#     try:
+#         buy_file_path, code = get_file(buyFile)
+
+#         bidder_file_path, code = get_file(bidderFile)
+
+#         json_data = {
+#             "code": 0,
+#             "name": '',
+#             "msg": "文件下载成功",
+#             "data": {},
+#             "ok": True
+#         }
+#     except Exception:
+#         json_data = {
+#             "code": 1,
+#             "name": '',
+#             "msg": "文件下载失败",
+#             "data": {},
+#             "ok": False
+#         }
+
+#     return json_data

+ 135 - 0
app.py

@@ -0,0 +1,135 @@
+#!/usr/bin/python
+# -*- coding=utf-8 -*-
+# @Create Time:		2024-08-05 15:12:31
+# @Last Modified time: 2024-12-25 16:17:05
+import os
+os.environ['TRANSFORMERS_OFFLINE'] = '1'
+import uuid
+from typing import List, Literal, Optional, Union
+
+import torch
+import uvicorn
+import numpy as np
+from pydantic import BaseModel, Field
+from fastapi import FastAPI, File, UploadFile, Form, BackgroundTasks, HTTPException
+# from flask import Flask, jsonify, request
+from transformers import AutoTokenizer, AutoModel
+
+from celery_tasks.all_instance import detail_task
+
+
+app = FastAPI()
+# app = Flask(__name__)
+
+
+class ModelLoader:
+    _instance = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            # 加载模型代码
+            cls._instance.model = AutoModel.from_pretrained("GanymedeNil/text2vec-base-chinese")
+            cls._instance.tokenizer = AutoTokenizer.from_pretrained("GanymedeNil/text2vec-base-chinese")
+        return cls._instance
+
+
+# model_loader = ModelLoader()
+
+
+class EmbeddingInput(BaseModel):
+    input: str
+    model: Optional[str] = "text2vec-base-chinese"
+
+
+class Embeding(BaseModel):
+    embedding: Optional[list] = []
+    index: Optional[int] = 0
+    object: Optional[str] = 'embedding'
+
+
+class Usage(BaseModel):
+    prompt_tokens: int
+    total_tokens: int
+
+
+class EmbedingResponse(BaseModel):
+    data: List[Embeding]
+    model: Optional[str] = 'text2vec-base-chinese'
+    object: Optional[str] = 'list'
+    usage: Usage
+
+
+class ResponseModel(BaseModel):
+    error_code: Optional[int] = 0
+    error_msg: Optional[str] = ''
+    log_id: Optional[str] = ''
+    result: Optional[dict] = {}
+
+
+# @app.post('/v1/embeddings', response_model=EmbedingResponse)
+# async def create_embeding(request: EmbeddingInput):
+#     encoded_input = model_loader.tokenizer(request.input, return_tensors='pt')
+#     with torch.no_grad():
+#         output = model_loader.model(**encoded_input)
+#     text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0).tolist()
+#     return EmbedingResponse(
+#         data=[
+#             Embeding(embedding=text_embedding)
+#         ],
+#         usage=Usage(
+#             prompt_tokens=encoded_input.input_ids.shape[0] * encoded_input.input_ids.shape[1],
+#             total_tokens=encoded_input.input_ids.shape[0] * encoded_input.input_ids.shape[1]
+#         )
+#     )
+
+
+# @app.route('/v1/embeddings', methods=['POST'])
+# def create_embeding(request: EmbeddingInput):
+#     encoded_input = model_loader.tokenizer(request.input, return_tensors='pt')
+#     with torch.no_grad():
+#         output = model_loader.model(**encoded_input)
+#     text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0).tolist()
+#     return EmbedingResponse(
+#         data=[
+#             Embeding(embedding=text_embedding)
+#         ],
+#         usage=Usage(
+#             prompt_tokens=encoded_input.input_ids.shape[0] * encoded_input.input_ids.shape[1],
+#             total_tokens=encoded_input.input_ids.shape[0] * encoded_input.input_ids.shape[1]
+#         )
+#     )
+
+
+@app.post('/detail_check', response_model=ResponseModel)
+async def predict(background_task: BackgroundTasks, projectId: str = Form(), projectName: str = Form(), bidderUnit: str = Form(), zb_filename: str = Form(), tb_filename: str = Form(), files: List[UploadFile] = File(...)):
+    for file in files:
+        if file.filename == zb_filename:
+            print('招标文件')
+            zb_file = f'./tmp/zb_file-{uuid.uuid4()}.pdf'
+            zb_res = await file.read()
+            with open(zb_file, 'wb') as f:
+                f.write(zb_res)
+        elif file.filename == tb_filename:
+            print('投标文件')
+            tb_file = f'./tmp/tb_file-{uuid.uuid4()}.json'
+            tb_res = await file.read()
+            with open(tb_file, 'wb') as f:
+                f.write(tb_res)
+        else:
+            return ResponseModel(error_code=1, error_msg='未识别文件')
+    background_task.add_task(detail_task, zb_file=zb_file, tb_file=tb_file, tb_filename=tb_filename, projectId=projectId, project=projectName, supplier=bidderUnit)
+    return ResponseModel(result={"task_id": f"{uuid.uuid4()}"})
+
+
+# @app.route('/detail_check', methods=['POST'])
+# def predict():
+#     tb_file = request.files['tb']
+#     zb_file = request.files['zb']
+#     tb_bytes = tb_file.read()
+#     zb_bytes = zb_file.read()
+#     return ResponseModel(result={"task_id": "T000001"})
+
+
+if __name__ == '__main__':
+    uvicorn.run(app, host='0.0.0.0', port=5000)
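
The new /detail_check endpoint added in app.py takes a multipart form: five string fields plus a list of uploads, and each upload's filename must match either zb_filename or tb_filename, otherwise the request is rejected with error_code 1. A minimal client sketch (the file names, project values and the localhost:5000 address are illustrative assumptions, not part of this commit):

    # Hypothetical client for the /detail_check endpoint; assumes the service
    # runs on localhost:5000, the ./tmp directory exists on the server side,
    # and tender.pdf / bid.json exist locally.
    import requests

    form = {
        'projectId': '2024-9-24-0',
        'projectName': '华新',
        'bidderUnit': '杭州华新机电工程有限公司',
        'zb_filename': 'tender.pdf',   # must equal the tender upload's filename
        'tb_filename': 'bid.json',     # must equal the bid upload's filename
    }
    files = [
        ('files', ('tender.pdf', open('tender.pdf', 'rb'), 'application/pdf')),
        ('files', ('bid.json', open('bid.json', 'rb'), 'application/json')),
    ]
    resp = requests.post('http://127.0.0.1:5000/detail_check', data=form, files=files)
    print(resp.json())  # e.g. {'error_code': 0, ..., 'result': {'task_id': '...'}}

The handler writes both uploads to ./tmp/ and schedules detail_task through FastAPI's BackgroundTasks, so the response returns immediately with a task id while the document comparison runs in the background.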

+ 7 - 4
celery_tasks/LLMAgent.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-12-02 16:50:32
+# @Last Modified time: 2024-12-23 18:17:53
 import re
 import json
 from enum import Enum
@@ -158,8 +158,10 @@ def get_proj(input_json: dict, standard: str):
 
     client = instructor.from_openai(
         OpenAI(
-            base_url='http://180.76.147.97:11434/v1',
-            api_key='ollama'
+            # base_url='http://180.76.147.97:11434/v1',
+            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            # api_key='ollama'
+            api_key='sk-45971a6af6d94ccd89321f8f6d370b38'
         ),
         mode=instructor.Mode.JSON,
     )
@@ -170,7 +172,8 @@ def get_proj(input_json: dict, standard: str):
     ]
 
     response = client.chat.completions.create(
-        model='qwen2.5:7b',
+        # model='qwen2.5:7b',
+        model='qwen-turbo',
         # model='wangshenzhi/llama3.1_8b_chinese_chat:latest',
         response_model=ResInfo,
         messages=messages,

+ 3 - 2
celery_tasks/__init__.py

@@ -2,12 +2,13 @@
 # @Author: privacy
 # @Date:   2024-08-27 11:19:15
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-29 13:32:08
+# @Last Modified time: 2024-12-27 11:35:29
 from celery import Celery
 
 celery_app = Celery(
     'tasks',
-    broker='redis://:test@127.0.0.1:6379/0',
+    broker='redis://:redis123@192.168.1.202:6387/3',
+    # broker='redis://:test@127.0.0.1:6379/0',
     # backend='redis://:test@127.0.0.1:6379/1',
     backend='db+sqlite:///celeryresults.sqlite3',
     include=[  # 执行任务库,代表所能执行的所有任务,即通过@celery_app.task修饰的所有函数

+ 68 - 0
celery_tasks/all_instance.py

@@ -0,0 +1,68 @@
+#!/usr/bin/python
+# -*- coding=utf-8 -*-
+# @Create Time:		2024-12-23 15:22:39
+# @Last Modified time: 2024-12-25 16:17:09
+import uuid
+import time
+import json
+import requests
+from .get_tender_info import PdfExtractAttr_
+from .document_ import DocumentPreReview
+from .busi_instance import busi_loc
+from .tech_instance import tech_loc
+from .parse_textmind_result import get_ocr_new
+
+
+def detail_task(zb_file: str, tb_file: str, tb_filename: str, projectId: str, project: str, supplier: str):
+    host = '192.168.1.111:9999'
+
+    detail_check_url = f'http://{host}/bm/alg/result_check_detail'
+
+    agent = PdfExtractAttr_(file_path=zb_file)
+    zb_table_path = f'./tmp/zb_table-{uuid.uuid4()}.json'
+    agent.parse_table_pro(table_path=zb_table_path)
+
+    dpr = DocumentPreReview(zb_table_path)
+    scrutinize_dict = dpr.get_table()
+    print(scrutinize_dict)
+
+    with open(tb_file, 'r', encoding='utf-8') as fp:
+        raw = json.load(fp)
+    result = get_ocr_new(raw=raw, pretty=True)
+
+    busi_score = busi_loc(
+        scrutinize_dict=scrutinize_dict,
+        outline_dict=result['outline'],
+        title_list=result['title'],
+        table_list=result['tables'],
+        image_list=result['images'],
+        supplier=supplier,
+        project=project,
+        file_name=tb_filename.split('.')[0] + 'pdf'
+    )
+    busi_score['name'] = '商务部分评分标准'
+
+    tech_score = tech_loc(
+        scrutinize_dict=scrutinize_dict,
+        outline_dict=result['outline'],
+        content_list=result['contents'],
+        supplier=supplier,
+        project=project,
+        file_name=tb_filename.split('.')[0] + 'pdf'
+    )
+    tech_score['name'] = '技术部分评分标准'
+
+    detail_data = {
+        'projectId': projectId,
+        'projectName': project,
+        'bidderUnit': supplier,
+        'list': [busi_score, tech_score]
+    }
+
+    print(detail_data)
+
+    detail_check_response = requests.post(url=detail_check_url, json=detail_data)
+
+    print("detail_check_response: ", detail_check_response.json())
+
+    return

+ 12 - 10
celery_tasks/document_.py

@@ -4,20 +4,25 @@
 1. 解析Bidding_document_extract中all_tables.json结果
 '''
 import re
-
+import json
 from celery_tasks.tools import BaseMethods
 
 
 class DocumentPreReview:
-    def _scrutinize_judge(self, tag:str, threshold_value:int=3):
-        ''' Clause number content judgment 
-            商务 技术 报价 评审 评分 标准
+    def __init__(self, table_path: str):
+        with open(table_path, 'r', encoding='utf-8') as fp:
+            self.Bidding_tables = json.load(fp)
+
+    def _scrutinize_judge(self, tag: str, threshold_value: int = 3):
+        '''
+        Clause number content judgment
+        商务 技术 报价 评审 评分 标准
         '''
-        scrutinize_tuple = ("商务","技术","报价","评审","评分","标准","部分")
+        scrutinize_tuple = ("商务", "技术", "报价", "评审", "评分", "标准", "部分")
         hit_num = 0
         for scru in scrutinize_tuple:
-            if scru in tag: hit_num+= 1
-        if hit_num>=threshold_value: return True
+            if scru in tag: hit_num += 1
+        if hit_num >= threshold_value: return True
         else: return False
 
     def check_table(self, all_tables):
@@ -95,9 +100,6 @@ class DocumentPreReview:
                 tables_list.append(partial_form)
         return tables_list
 
-
-
-
     def get_table(self):
         ''' parse the Bidding_tables.json file to get the table data from it.
         '''

+ 573 - 0
celery_tasks/get_tender_info.py

@@ -0,0 +1,573 @@
+'''招标文件内容提取'''
+import pandas as pd
+import numpy as np
+import pdfplumber
+import json
+import os
+import re
+import cv2
+from io import BytesIO
+
+from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
+from pdfminer.high_level import extract_pages
+from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
+from pdfminer.pdftypes import (
+    LITERALS_DCT_DECODE,
+    LITERALS_JBIG2_DECODE,
+    LITERALS_JPX_DECODE,
+    LITERALS_FLATE_DECODE,
+)
+from pprint import pprint
+from pdfminer.pdfparser import PDFParser, PDFSyntaxError
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+import pdfplumber
+import camelot
+
+from .tools import RefPageNumberResolver
+
+HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'})
+HEADERS |= set({'条款号' ,'评审因素' ,'评审标准', ''})
+
+
+
+def is_title(line: str) -> bool:
+    title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
+    if title_word:
+        return True
+    title_word = re.findall('^附录|^参考文献|^附表', line.strip())
+    if title_word:
+        return True
+    return False
+
+PIL_ERROR_MESSAGE = "PIL导入错误"
+def _save_jpeg(image: LTImage, path: str) -> str:
+    """Save a JPEG encoded image"""
+    raw_data = image.stream.get_rawdata()
+    assert raw_data is not None
+
+    path = path + ".jpg"
+
+    with open(path, "wb") as fp:
+        if LITERAL_DEVICE_CMYK in image.colorspace:
+            try: 
+                from PIL import Image, ImageChops  # type: ignore[import] 
+            except ImportError: 
+                raise ImportError(PIL_ERROR_MESSAGE) 
+ 
+            ifp = BytesIO(raw_data)
+            i = Image.open(ifp)
+            i = ImageChops.invert(i)
+            i = i.convert("RGB")
+            i.save(fp, "JPEG")
+        else:
+            fp.write(raw_data)
+
+    return path
+
+def _save_jpeg2000(image: LTImage, path: str) -> str:
+    """Save a JPEG 2000 encoded image"""
+    raw_data = image.stream.get_rawdata()
+    assert raw_data is not None
+
+    path = path + ".png"
+
+    try:
+        from PIL import Image  # type: ignore[import]
+    except ImportError:
+        raise ImportError(PIL_ERROR_MESSAGE)
+
+    # 如果我们只写原始数据,我尝试过的大多数图像程序都无法打开文件。
+    # 然而,使用OpenCV2打开和保存会生成一个文件,该文件似乎很容易被其他程序打开
+    ifp = BytesIO(raw_data)
+    i = Image.open(ifp)
+    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
+    cv2.imwrite(path, opencv_image)
+    return path
+
+def export_image(image: LTImage, path: str) -> str:
+    """Save an LTImage to disk"""
+    (width, height) = image.srcsize
+
+    filters = image.stream.get_filters()
+
+    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
+        name = _save_jpeg(image, path)
+        return name
+
+    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
+        name = _save_jpeg2000(image, path)
+        return name
+
+    data = image.stream.get_data()
+    raw_data = image.stream.get_rawdata()
+
+    if data:
+        if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':
+            path += '.jpg'
+            with open(path, 'wb') as file:
+                file.write(data)
+            return path
+        elif data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
+            path += '.png'
+            with open(path, 'wb') as file:
+                file.write(data)
+            return path
+        elif data[:2] == b'\x42\x4d':
+            path += '.bmp'
+            with open(path, 'wb') as file:
+                file.write(data)
+            return path
+        elif data[:6] == b'\x47\x49\x46\x38\x37\x61' or data[:6] == b'\x47\x49\x46\x38\x39\x61':
+            path += '.gif'
+            with open(path, 'wb') as file:
+                file.write(data)
+            return path
+        elif data[:2] == b'\x4d\x4d' or data[:2] == b'\x49\x49':
+            path += '.tiff'
+            with open(path, 'wb') as file:
+                file.write(data)
+            return path
+        else:
+            path += '.unk'
+            with open(path, 'wb') as file:
+                file.write(data)
+            return path
+    elif raw_data:
+        if raw_data[:2] == b'\xff\xd8' and raw_data[-2:] == b'\xff\xd9':
+            path += '.jpg'
+            with open(path, 'wb') as file:
+                file.write(raw_data)
+            return path
+        elif raw_data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
+            path += '.png'
+            with open(path, 'wb') as file:
+                file.write(raw_data)
+            return path
+        elif raw_data[:2] == b'\x42\x4d':
+            path += '.bmp'
+            with open(path, 'wb') as file:
+                file.write(raw_data)
+            return path
+        elif raw_data[:6] == b'\x47\x49\x46\x38\x37\x61' or raw_data[:6] == b'\x47\x49\x46\x38\x39\x61':
+            path += '.gif'
+            with open(path, 'wb') as file:
+                file.write(raw_data)
+            return path
+        elif raw_data[:2] == b'\x4d\x4d' or raw_data[:2] == b'\x49\x49':
+            path += '.tiff'
+            with open(path, 'wb') as file:
+                file.write(raw_data)
+            return path
+        else:
+            path += '.unk'
+            with open(path, 'wb') as file:
+                file.write(raw_data)
+            return path
+    else:
+        return None
+
+def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
+    texts = []
+    images = []
+    # 读取PDF文件并提取页面
+    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
+        title_index = 0
+        image_index = 0
+        for element in page_layout:
+            if isinstance(element, LTLine):
+                pass
+            elif isinstance(element, LTRect):
+                pass
+            elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
+                text = element.get_text().strip()
+                # # 假设标题通常是一行且字体较大
+                if text and (is_title(text) or element.height > 15):
+                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
+                    title_index += 1
+            # elif isinstance(element, LTFigure):
+            #     for e_obj in element._objs:
+            #         if isinstance(e_obj, LTImage):
+            #             # 提取图片数据
+            #             image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
+            #             image_file = export_image(e_obj, image_file)
+            #             images.append(image_file)
+            #             pprint(f'Image saved: {image_file}')
+            #             image_index += 1
+
+    with open(title_path, 'w', encoding='utf-8') as fp:
+        json.dump(texts, fp, indent=4, ensure_ascii=False)
+    return title_path,image_dir
+
+from typing import Optional, List
+def parse_title(file_path: str, title_path: Optional[str] = None) -> list:
+        """
+        标题解析,用于报价唯一
+
+        Args:
+            title_path: 保存路径
+
+        Returns:
+            results:    标题列表
+        """
+        results = []
+
+        seq_num = 0
+
+        for page_number, page_layout in enumerate(extract_pages(file_path)):
+            title_index = 0
+            for element in page_layout:
+                if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
+                    text = element.get_text().strip()
+                    if text and (is_title(text) or element.height > 15):
+                        results.append({
+                            'index': title_index,
+                            'page_number': page_number,
+                            'bbox': element.bbox,
+                            'text': text,
+                            'title': text,
+                            'seq_num': seq_num
+                        })
+                        seq_num += 1
+                        title_index += 1
+
+        if title_path:
+            with open(title_path, 'w', encoding='utf-8') as fp:
+                json.dump(results, fp, indent=4, ensure_ascii=False)
+
+        return title_path
+
+def parse_image(file_path: str, image_dir: str, image_meta_path: str) -> List[dict]:
+    """
+    解析PDF中的图片
+    Args:
+        image_dir:  解析目录
+
+    Returns:
+        image_list: 图片列表
+    """
+    image_list = []
+
+    for page_number, page_layout in enumerate(extract_pages(file_path)):
+        image_index = 0
+        for element in page_layout:
+            if isinstance(element, LTFigure):
+                for e_obj in element._objs:
+                    if isinstance(e_obj, LTImage):
+                        # 提取图片数据
+                        image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
+                        image_file = export_image(e_obj, image_file)
+                        image_list.append({
+                            "image_index": image_index,
+                            "page_number": page_number,
+                            "image_name": image_file
+                        })
+                        image_index += 1
+
+    if image_meta_path:
+        with open(image_meta_path, 'w', encoding='utf-8') as fp:
+            json.dump(image_list, fp, indent=4, ensure_ascii=False)
+
+    return image_meta_path
+
+
+def table_parse(pdf_path: str, 
+                title_path: str, 
+                start_title: str = '第三章 评标办法(综合评估法)', 
+                end_title: str = '第四章 合同条款及格式', 
+                table_path: str = None, 
+                start_page_number: int = None, 
+                end_page_number: int = None
+            ) -> list:
+    """pdf表格解析功能
+    @pdf_path
+    @title_path
+    @start_title
+    @end_title
+    @table_path
+    @start_page_number
+    @end_page_number
+    """
+    tables = []
+
+    if (start_page_number == None) or (end_page_number == None):
+        df = pd.read_json(title_path)
+        start_page_number = df[df['text'] == start_title].page_number.max()
+        end_page_number = df[df['text'] == end_title].page_number.max()
+
+    def concat_table(tables, table):
+        """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
+        @tables
+        @table
+        """
+        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
+        tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
+        if len(table) > 1:
+            second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
+        # pprint(first)
+        if len(HEADERS & set(first)) > 2:
+            # pprint("找到大量表头元素,判断为独立表头,生成新表!")
+            tables.append({"page_numbers": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
+        elif ((i-1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
+            # pprint("有空列,不是单独表,直接合并")
+            tables[-1]['page_numbers'].append(i)
+            tables[-1]['table'].extend(table)
+        else:
+            tables.append({"page_numbers": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
+        return tables
+
+    with pdfplumber.open(pdf_path) as pdf:
+        print(start_page_number, end_page_number)
+        for i in range(start_page_number, end_page_number):
+            for table in pdf.pages[i].extract_tables():
+                tables = concat_table(tables, table)
+
+    with open(table_path, 'w', encoding='utf-8') as fp:
+        json.dump(tables, fp, indent=4, ensure_ascii=False)
+
+    return table_path
+
+
+class PdfExtractAttr_(object):
+    def __init__(self, file_path: str):
+        """PDF文件解析
+        @file_path
+        """
+        super(PdfExtractAttr_, self).__init__()
+        self.file_path = file_path
+        self.details = []
+        self.tables = []
+        self.content = []
+        self.chapters = []
+        self.references = []
+        self.detail_df = None
+        self.outlines = None
+
+    def parse_outline(self):
+        """PDF大纲解析
+        """
+        results = []
+        with open(self.file_path, "rb") as fp:
+            try:
+                parser = PDFParser(fp)
+                document = PDFDocument(parser)
+                ref_pagenum_resolver = RefPageNumberResolver(document)
+                outlines = document.get_outlines()
+                for (level, title, dest, a, se) in outlines:
+                    if dest:
+                        page_num = ref_pagenum_resolver.resolve(dest)
+                    elif a:
+                        page_num = ref_pagenum_resolver.resolve(a)
+                    elif se:
+                        page_num = ref_pagenum_resolver.resolve(se)
+                    else:
+                        page_num = None
+                    results.append({'level': level, 'title': title, 'page_number': page_num})
+            except PDFNoOutlines:
+                print("No outlines found.")
+            except PDFSyntaxError:
+                print("Corrupted PDF or non-PDF file.")
+            finally:
+                parser.close()
+
+        with open('outlines.json', 'w', encoding='utf-8') as op:
+            json.dump(results, op, indent=4, ensure_ascii=False)
+
+        # print(results)
+    
+    def extract_content(self, content_path: str = None) -> list:
+        with pdfplumber.open(self.file_path) as pdf:
+            for page in pdf.pages:
+                self.content.append({
+                    'page_number': page.page_number - 1,
+                    'text': page.extract_text()
+                })
+        
+        
+        with open(content_path, 'w', encoding='utf-8') as fp:
+            json.dump(self.content, fp, indent=4, ensure_ascii=False)
+
+        return content_path
+
+    def parse_text(self) -> None:
+        """文本解析
+        """
+        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
+            for element in page_layout:
+                if isinstance(element, LTTextBoxHorizontal):
+                    # 距离左侧
+                    left = element.x0
+                    # 距离右侧
+                    right = (page_layout.width - element.x1)
+                    # 距离上侧
+                    top = (page_layout.height - element.y1)
+                    # 距离下侧
+                    button = element.y0
+                    # 文本宽度
+                    width = element.width
+                    if (left > right) and (abs(left - right) > 100):
+                        alignment = 'right'
+                    elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
+                        alignment = 'center'
+                    else:
+                        alignment = 'left'
+                    self.details.append({
+                        'page_number': page_number,
+                        'index': element.index,
+                        'x0': element.bbox[0],
+                        'y0': element.bbox[1],
+                        'x1': element.bbox[2],
+                        'y1': element.bbox[3],
+                        'alignment': alignment,
+                        'lines': len(element._objs),
+                        'text': element.get_text().strip(),
+                        'is_table_name': element.get_text().strip().endswith('表')
+                    })
+        self.detail_df = pd.DataFrame(self.details)
+
+    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
+        """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
+        @table
+        """
+        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
+
+        if new:
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+            return
+
+        tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
+        if len(table) > 1:
+            second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
+        else:
+            second = None
+        # pprint(first)
+        if not self.tables or len(HEADERS & set(first)) > 2:
+            # pprint("找到大量表头元素,判断为独立表头,生成新表!")
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
+        elif second and (len(HEADERS & set(second)) > 2):
+            # pprint("找到大量表头元素,判断为独立表头,生成新表!")
+            if not table_name:
+                first = [i for i in first if i]
+                if len(first) == 1:
+                    table_name = "".join(first)
+            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
+        elif ((page_number-1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
+            # pprint("有空列,不是单独表,直接合并")
+            self.tables[-1]['page_numbers'].append(page_number)
+            self.tables[-1]['table'].extend(table)
+        else:
+            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
+    
+
+    def parse_table(self) -> None:
+        """表格解析
+        """
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # 查询是否存在表格
+                tables = page_layout.find_tables()
+                # 检测到该页面存在一个表格,对其进行合并判断
+                if len(tables) == 1:
+                    table = tables[0]
+                    x0, y0, x1, y1 = table.bbox
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
+                    if table_title_df.empty:
+                        self.concat_table(table.extract(), page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']
+                        self.concat_table(table.extract(), page_number=page_number, table_name=table_title_name)
+                    table = tables[0]
+                    #self.concat_table(table.extract(), table_title_name)
+                # 检测到存在多个表格,对第一个表格进行合并判断之后的表格一定不相干
+                elif len(tables) > 1:
+                    pass
+    
+    def parse_table_pro(self, table_path: str = 'all_tables.json') -> str:
+        """表格解析
+        """
+        if self.detail_df == None:
+            self.parse_text()
+
+        with pdfplumber.open(self.file_path) as pdf:
+            for page_number, page_layout in enumerate(pdf.pages):
+                # 查询是否存在表格
+                tables = page_layout.find_tables()
+
+                if not tables:
+                    continue
+
+                tables_pro = camelot.read_pdf(
+                    self.file_path,
+                    # flavor='stream',
+                    pages=str(page_number+1),
+                    # edge_tol=200,
+                )
+
+                if not tables_pro:
+                    continue
+
+                print(len(tables), len(tables_pro))
+
+                # 检测到该页面存在一个表格,对其进行合并判断
+                if (len(tables) != 0) and (len(tables_pro) == 1):
+                    print(f"解析PDF{page_number}页的表格")
+                    # print(f"解析PDF{page_number}页的表格")
+                    table = tables[0]
+                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
+                    x0, y0, x1, y1 = table.bbox
+                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
+                    if table_title_df.empty:
+                        self.concat_table(table_pro, page_number=page_number)
+                    else:
+                        table_title_name = table_title_df.iloc[0]['text']
+                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
+                    table = tables[0]
+                # 检测到存在多个表格,对第一个表格进行合并判断之后的表格一定不相干
+                elif len(tables_pro) > 1:
+                    print(f"解析PDF{page_number}页的表格")
+                    first_table = tables_pro[0]
+                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
+                    for table_index in range(1, len(tables_pro)):
+                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)
+
+        with open(table_path, 'w', encoding='utf-8') as fp:
+            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
+        return table_path
+
+
+if __name__ == '__main__':
+    # pdf_path = 'data/预审查数据/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.pdf'
+    # image_dir = 'data/预审查数据/extracted_images'
+    # title_path = 'data/预审查数据/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.json'
+
+    # pdf_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.pdf'
+    # image_dir = 'data/预审查数据/extracted_images'
+    # title_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
+
+    # os.makedirs(image_dir, exist_ok=True)
+    # main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
+
+    # table_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
+    # content_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
+    # agent = PdfExtractAttr_(file_path=pdf_path)
+
+    ## agent.extract_content(content_path=content_path)
+    # contents = agent.output_()  
+    
+    # agent.parse_text()
+    # agent.parse_table()
+    ## agent.parse_table_pro(table_path=table_path)
+    # all_tables = agent.output()
+
+    import glob
+    dir_path = 'data/财报素材'
+    for pdf_path in glob.glob(f'{dir_path}/*.pdf'):
+        print(pdf_path)
+        if '600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告' not in pdf_path: continue
+        agent = PdfExtractAttr_(file_path=pdf_path)
+
+        content_path = f'{dir_path}/{pdf_path.split("/")[-1].split(".")[0]}_content.json'
+        agent.extract_content(content_path=content_path)
+
+        table_path = f'{dir_path}/{pdf_path.split("/")[-1].split(".")[0]}_table.json'
+        agent.parse_table_pro(table_path=table_path)
+

+ 3 - 3
celery_tasks/matcher.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-27 09:33:01
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-06 14:12:50
+# @Last Modified time: 2024-12-23 15:07:08
 import os
 os.environ['TRANSFORMERS_OFFLINE'] = '1'
 from typing import List, Union
@@ -48,7 +48,7 @@ class Matcher:
         Returns:
             text_embedding: 文本向量
         """
-        encoded_input = self.tokenizer(text, return_tensors='pt')
+        encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
         with torch.no_grad():
             output = self.model(**encoded_input)
         text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
@@ -64,7 +64,7 @@ class Matcher:
         """
         text_embeddings = []
         for text in text_list:
-            encoded_input = self.tokenizer(text, return_tensors='pt')
+            encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
             with torch.no_grad():
                 output = self.model(**encoded_input)
             text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))

+ 1 - 1
celery_tasks/responser.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-07-24 14:11:01
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-12-05 11:25:15
+# @Last Modified time: 2024-12-23 15:50:18
 from typing import Optional
 from dataclasses import dataclass, asdict
 

+ 2 - 2
celery_tasks/tech_instance.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-12-03 17:57:33
+# @Last Modified time: 2024-12-25 14:29:44
 # 技术部分定位
 from typing import List, Optional
 
@@ -99,7 +99,7 @@ def tech_loc(scrutinize_dict: dict, outline_dict: List[dict], content_list: List
                 'writeName': '',
                 'name': supplier,
                 'grade': 'B',
-                'supplier': f'概括文字(200字以内){comment}',
+                'supplier': f'概括文字: {comment}',
                 'pages': pages
             }]
         })

+ 21 - 0
docker-compose.yaml

@@ -0,0 +1,21 @@
+services:
+  web:
+    build: .
+    command: uvicorn main:app --host 0.0.0.0 --port 8000
+    volumes:
+      - ./app:/app
+    ports:
+      - 8000:8000
+    depends_on:
+      - redis
+
+  worker:
+    build: .
+    command: celery -A celery_tasks worker --loglevel=info
+    volumes:
+      - ./app:/app
+    depends_on:
+      - redis
+
+  redis:
+    image: "redis:alpine"

+ 17 - 1
poetry.lock

@@ -4667,6 +4667,22 @@ type = "legacy"
 url = "https://pypi.tuna.tsinghua.edu.cn/simple"
 reference = "tsinghua"
 
+[[package]]
+name = "python-multipart"
+version = "0.0.20"
+description = "A streaming multipart parser for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104"},
+    {file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "tsinghua"
+
 [[package]]
 name = "python-oxmsg"
 version = "0.0.1"
@@ -6716,4 +6732,4 @@ reference = "tsinghua"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9, <3.13"
-content-hash = "4d686613a747bbfd128678a06a37320f3abf278b13fbcffd15fd16a698c05c23"
+content-hash = "ea0f25987f4d8f5da7f99385aef1d47199c17ff6dab8015c07cc239c927c24a4"

+ 1 - 0
pyproject.toml

@@ -29,6 +29,7 @@ markdown = "^3.7"
 langchain-huggingface = "^0.1.2"
 langchain-chroma = "^0.1.4"
 faiss-cpu = "^1.9.0.post1"
+python-multipart = "^0.0.20"
 
 [[tool.poetry.source]]
 name = "tsinghua"

+ 22 - 0
run.py

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-09-03 11:41:18
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-09-05 11:07:42
+import uvicorn
+
+original_callback = uvicorn.main.callback
+
+
+def callback(**kwargs):
+    from celery.contrib.testing.worker import start_worker
+    from celery_tasks import celery_app
+
+    with start_worker(celery_app, concurrency=2, perform_ping_check=False, loglevel="info"):
+        original_callback(**kwargs)
+
+
+uvicorn.main.callback = callback
+
+if __name__ == '__main__':
+    uvicorn.main()
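
run.py wraps uvicorn's CLI callback so that an in-process Celery worker (concurrency of 2, ping check disabled) starts alongside the web server, which avoids running a separate worker process during development. It therefore accepts uvicorn's normal command-line arguments; a hypothetical invocation (the app:app target is an assumption based on the app.py added in this commit):

    python run.py app:app --host 0.0.0.0 --port 5000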