Jelajahi Sumber

Trans Text Mind to Standard format

sprivacy 10 bulan lalu
induk
melakukan
ad54b63f38

+ 209 - 0
busi_instance.py

@@ -0,0 +1,209 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-08-30 11:17:21
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-09-26 14:32:39
+
+"""
+商务部分
+"""
+import re
+from typing import List, Optional
+
+from celery_tasks.LLMAgent import get_proj
+from celery_tasks.project_loc import extract_project
+from celery_tasks.text_extractor import similar_match
+from celery_tasks.extract_financial_report import extract_financial_report
+
+
def get_score(comment: str, standard: str) -> Optional[str]:
    """Extract a grade from an evaluation comment according to the scoring standard.

    Args:
        comment: free-text evaluation, e.g. "B级(70分)" or "综合得 85 分".
        standard: the scoring-standard text; its format decides whether a
            letter grade or a numeric score is extracted from ``comment``.

    Returns:
        A letter grade 'A'–'D', a numeric score as a string, or ``None``
        when the standard matches neither known format.
    """
    # Letter-grade standard: "A~D" / "A～D" (full-width tilde) / "A-D" / "A、B、C".
    # (The original pattern repeated the same alternative "A~D" twice.)
    if re.search(r'A[~～\-]D|A、B、C', standard):
        try:
            # The last "X级" mention wins; fall back to the middle grade 'B'.
            return re.findall(r'([A-D])级', comment)[-1]
        except IndexError:
            return 'B'
    # Numeric standard: contains "<digits> 分".
    if re.search(r'\d+\s?分', standard):
        try:
            return re.findall(r'(\d+)\s?分', comment)[-1]
        except IndexError:
            return "60"
    # Unknown standard format: explicitly signal "no grade" to the caller.
    return None
+
+
def _criterion(item: dict, suppliers: List[dict]) -> dict:
    """Assemble one scoringCriteria entry from a review item and supplier results."""
    return {
        'scoringFactors': item['评分因素'],
        'scoringStandard': item['评分标准'],
        'percentage': item['权重'],
        'suppliers': suppliers
    }


def busi_loc(scrutinize_dict: dict, outline_dict: List[dict], title_list: List[dict], table_list: List[dict], image_list: List[dict], supplier: str, project: str = None, file_name: str = None) -> Optional[dict]:
    """
    Locate and score the business (商务) section of a bid document.

    Args:
        scrutinize_dict: detailed review outline; keys are section names,
            values are lists of scoring items (评分因素 / 评分标准 / 权重).
        outline_dict: document outline entries used for fuzzy title matching.
        title_list: extracted title records of the bid file.
        table_list: extracted table records of the bid file.
        image_list: extracted image records of the bid file.
        supplier: supplier (bidder) name.
        project: project name (currently unused; kept for API compatibility).
        file_name: source file name echoed back in page-location records.

    Returns:
        A dict with a 'scoringCriteria' list, or ``None`` when the review
        outline contains no business (商务) section.
    """
    # Find the business scoring section in the review outline.
    part = next((k for k in scrutinize_dict if '商务' in k), None)

    # No business review method found: nothing to score.
    if part is None:
        return None

    result = {
        'scoringCriteria': []
    }

    # BUG FIX: the original iterated scrutinize_dict[key], relying on the
    # leaked loop variable `key`; use the explicitly found `part` instead.
    for item in scrutinize_dict[part]:
        factor = item['评分因素']
        standard = item['评分标准']

        if '信用' in factor:
            # Credit rating: fixed medium comment, no page locations.
            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': get_score(comment="B级(70分)", standard=standard),
                'pages': []
            }]))
        elif '业绩' in factor:
            # Query the project track-record tables.
            proj_list = extract_project(table_list, instances=['合同金额', '合同价格', '发包人名称', '项目规模', '合同时间'])
            # Format the location info (last page number of each project record).
            title_sims = [{'fileName': file_name, 'pageKey': '', 'pageStart': str(proj['page_numbers'][-1]), 'pageEnd': str(proj['page_numbers'][-1])} for proj in proj_list]
            # Let the LLM grade the track record against the standard.
            comment = get_proj(input_json=proj_list, standard=standard)
            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': get_score(comment=comment, standard=standard),
                'pages': title_sims
            }]))
        elif '财务' in factor:
            # NOTE(review): the audit year is hard-coded to 2022 — confirm
            # whether it should be derived from the project/tender date.
            financial_list = extract_financial_report(
                title_list=title_list,
                table_list=table_list,
                image_list=image_list,
                year=2022
            )

            # Collect the distinct start pages of all financial-report hits.
            start_pages = {page[0] for entry in financial_list for page in entry['pages']}

            title_sims = [{
                "fileName": file_name,
                "pageKey": "",
                "pageEnd": page,
                "pageStart": page,
            } for page in start_pages]

            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': 'B',
                # NOTE(review): extra 'supplier' key looks like a leftover —
                # verify downstream consumers before removing it.
                'supplier': '3个关键的财务指标',
                'pages': title_sims
            }]))
        elif '报价' in factor:
            # Quotation: placeholder medium grade, no locations yet.
            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': 'B',
                'pages': []
            }]))
        elif '完整性' in factor:
            # Completeness: placeholder medium grade, no locations yet.
            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': 'B',
                'pages': []
            }]))
        elif '涉密' in factor:
            # Confidentiality: assume fully compliant (A 级 / 100 分).
            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': get_score(comment="A级(100分)", standard=standard),
                'pages': []
            }]))
        else:
            # Generic factor: fuzzy-match the factor name against the outline.
            title_sims = similar_match(outline_dict, [factor], key='title')

            pages = [{'fileName': file_name, 'pageStart': str(sim['page_number']), 'pageEnd': str(sim['page_number']), 'pageKey': '', 'text': sim['title'], 'score': sim['相似度']} for sim in title_sims]

            result['scoringCriteria'].append(_criterion(item, [{
                'name': supplier,
                'grade': get_score(comment="B级(70分)", standard=standard),
                'pages': pages,
            }]))

    return result
+
+
if __name__ == '__main__':
    # Ad-hoc driver: run busi_loc over the local test dataset and print the
    # first result. Requires ./bidding_dataset.json and the test-data tree.
    import os
    import json
    from glob import glob
    from pprint import pprint
    # Review outlines keyed by project name.
    with open('bidding_dataset.json', 'r', encoding='utf-8') as fp:
        scrutinizes = json.load(fp)
    for project in scrutinizes.keys():
        scrutinize_dict = scrutinizes[project]
        # One "-outline.json" per supplier bid file; sibling title/table/image
        # JSON files share the same prefix.
        for file in glob(f'./data/0预审查初审详审测试数据/{project}/*/*-outline.json'):
            with open(file, 'r', encoding='utf-8') as fp:
                outline_dict = json.load(fp)
            # An empty outline means a failed extraction — delete it so the
            # file gets regenerated on the next run. NOTE(review): destructive.
            if outline_dict == []:
                os.remove(file)
                continue
            with open(file.replace('outline.json', 'title.json'), 'r', encoding='utf-8') as fp:
                title_list = json.load(fp)
            with open(file.replace('outline.json', 'table.json'), 'r', encoding='utf-8') as fp:
                table_list = json.load(fp)
            with open(file.replace('outline.json', 'image.json'), 'r', encoding='utf-8') as fp:
                image_list = json.load(fp)

            # Supplier name taken from the parent directory. NOTE(review):
            # splitting on '\\' is Windows-only — glob yields '/' elsewhere.
            supplier = file.split('\\')[-2]

            pprint(
                busi_loc(
                    scrutinize_dict=scrutinize_dict,
                    outline_dict=outline_dict,
                    title_list=title_list,
                    table_list=table_list,
                    image_list=image_list,
                    supplier=supplier,
                    project=project,
                    file_name=file
                )
            )
            # Debugging aid: stop after the first file.
            exit(0)

+ 42 - 54
celery_tasks/LLMAgent.py

@@ -2,13 +2,16 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-18 14:23:12
+# @Last Modified time: 2024-09-19 13:56:44
 import re
 import json
+from enum import Enum
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
+import instructor
 from openai import OpenAI
+from pydantic import BaseModel
 
 
 class BaseLlmConfig(ABC):
@@ -130,70 +133,55 @@ class LLMAgent(LLMBase):
 
 
 def get_proj(input_json: dict, standard: str):
-    agent = LLMAgent(
-        config=BaseLlmConfig(
+    # agent = LLMAgent(
+    #     config=BaseLlmConfig(
+    #         base_url='http://180.76.147.97:11434/v1',
+    #         # model='qwen2:7b',
+    #         model='qwen2.5:7b',
+    #         # model='sam4096/qwen2tools:latest',
+    #         # model='wangshenzhi/llama3.1_8b_chinese_chat:latest',
+    #         temperature=1.0,
+    #         max_tokens=32 * 1024
+    #     )
+    # )
+
+    class LevelEnum(str, Enum):
+        A = 'A'
+        B = 'B'
+        C = 'C'
+        D = 'D'
+
+    class ResInfo(BaseModel):
+        最终得分: int
+        最终等级: LevelEnum
+        评价原因: str
+
+    client = instructor.from_openai(
+        OpenAI(
             base_url='http://180.76.147.97:11434/v1',
-            # model='qwen2:7b',
-            # model='sam4096/qwen2tools:latest',
-            model='wangshenzhi/llama3.1_8b_chinese_chat:latest',
-            temperature=1.0,
-            max_tokens=4096
-        )
+            api_key='ollama'
+        ),
+        mode=instructor.Mode.JSON,
     )
 
-    
-    # tools = [{
-    #     "type": "function",
-    #     "function": {
-    #         "name": "get_score",
-    #         "description": "根据评分标准从备选评价中获取评分",
-    #         "parameters": {
-    #             "type": "object",
-    #             "properties": {
-    #                 "comment": {
-    #                     "type": "string",
-    #                     "description": "备选评价"
-    #                 },
-    #                 "standard": {
-    #                     "type": "string",
-    #                     "description": "评分标准"
-    #                 }
-    #             },
-    #             "required": ['comment', 'standard']
-    #         }
-    #     }
-    # }]
-
-    # tool_maps = {"get_score": get_score}
-
     messages = [
         {"role": "system", "content": "你是一位优秀的数据分析师"},
-        {"role": "user", "content": "Q: 现在有这样一个数据 input_json: %s 数据集以JSON形式呈现,在数据集 input_json 上分析结果,%s,请一步步进行推理并得出结论。如果无法得出结论,则返回一个中等的评价,默认的A、B、C三个等级信用得分分别为100、80、70分。 最终的答案以 \\boxed{} 开头,不要讲多余的废话。" % (input_json, standard)}
+        {"role": "user", "content": "Q: 现在有这样一个数据 input_json: %s 数据集以JSON形式呈现,在数据集 input_json 上分析结果,%s,请一步步进行推理并得出结论。如果无法得出结论,则返回一个中等的评价。" % (input_json, standard)}
     ]
 
-    response = agent.generate_response(
-        messages=messages,
-    )
-
-    # messages.append({"role": "user", "content": f"comment: {comment}"})
-
     # response = agent.generate_response(
     #     messages=messages,
-    #     tools=tools
     # )
 
-    # tool_results = []
-
-    # for tool in response["tool_calls"]:
-    #     tool_results.append(tool_maps[tool["name"]](**tool["arguments"]))
-
-    # print(f"工具输出结果为: {tool_results}")
-
-    # messages.append({"role": "tool", "content": f"工具输出结果为: {tool_results}"})
+    response = client.chat.completions.create(
+        model='qwen2.5:7b',
+        # model='wangshenzhi/llama3.1_8b_chinese_chat:latest',
+        response_model=ResInfo,
+        messages=messages,
+        max_retries=3
+    )
 
-    # response = agent.generate_response(
-    #     messages=messages,
-    # )
+    print(response)
 
     return response
 

+ 6 - 2
celery_tasks/extract_financial_report.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-05 15:04:14
+# @Last Modified time: 2024-09-26 14:38:53
 import os
 import re
 
@@ -50,11 +50,15 @@ def extract_financial_report(title_list: list, table_list: list, image_list: lis
         table_list=table_list,
         image_list=image_list,
         instances=[
-            '财务状况', '{}年审计报告'.format(year - 1),
+            '财务状况',
+            '近年财务状况表',
+            '{}年审计报告'.format(year - 1),
             '{}年审计报告'.format(year - 2)
         ]
     )
 
+    print(instances)
+
     results = []
 
     for item in instances:

+ 2 - 2
celery_tasks/get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-05 16:29:06
+# @Last Modified time: 2024-09-25 17:34:06
 
 # 标准包导入
 import os
@@ -42,7 +42,7 @@ HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位
 
 pattern_1 = re.compile(r'^\d(\d*\.?\d*)+\d(%)?')
 pattern_2 = re.compile(r'^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[\(\(][一二三四五六七八九十]+[\)\)]')
-pattern_3 = re.compile('^附录|^参考文献|^附表')
+pattern_3 = re.compile(r'^附录|^参考文献|^附表|附件[一二三四五六七八九十\d]+')
 
 
 def is_title(line: str) -> bool:

+ 3 - 3
celery_tasks/instance_locate.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-03 10:17:47
+# @Last Modified time: 2024-09-26 09:48:46
 from typing import List, Optional
 
 from celery_tasks.tools import filter_tables, filter_images, filter_content
@@ -28,7 +28,6 @@ def get_instances_by_title(title_list: List[dict], table_list: List[dict], image
     Returns:
         返回列表,包含标题,索引,起始页,终止页,相似度,表格列表,图片列表
     """
-
     title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)
 
     title_filter = [i for i in title_sims]
@@ -45,7 +44,8 @@ def get_instances_by_title(title_list: List[dict], table_list: List[dict], image
 
         item['tables'] = filter_tables(table_list, item['page_number'], item['end_page'])
 
-        item['images'] = filter_images(image_list, item['page_number'], item['end_page'])
+        if image_list:
+            item['images'] = filter_images(image_list, item['page_number'], item['end_page'])
 
         if content_list:
             item['content'] = filter_content(content_list, item['page_number'], item['end_page'])

+ 39 - 8
celery_tasks/ocr_info.py

@@ -2,7 +2,8 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-18 10:19:07
+# @Last Modified time: 2024-09-27 14:08:30
+import re
 import json
 import pandas as pd
 
@@ -30,10 +31,7 @@ def parse_table(text):
     return table
 
 
-def get_ocr():
-    with open('D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8') as fp:
-        raw = json.load(fp)
-
+def get_ocr(raw: dict, pretty: bool = False):
     nodes = []
     for node in raw['para_nodes']:
         if node['node_type'] == 'root':
@@ -47,12 +45,45 @@ def get_ocr():
     df['box'] = df['position'].apply(lambda x: x[0]['box'])
     del df['position']
     df.text = df.apply(lambda row: parse_table(row['text']) if row['node_type'] == 'table' else row['text'], axis=1)
-    df.to_json('ocr_demo.json', orient='records', lines=True, force_ascii=False)
-    return df
+
+    if not pretty:
+        return df
+
+    title = pd.DataFrame(df.query(''' node_type == 'title' ''').to_dict('records'))
+    title['title'] = title['text']
+    title['page_number'] = title['pageno']
+    title['level'] = title['para_type'].apply(lambda x: int(re.findall(r'\d+', x).pop()) if re.findall(r'\d+', x) else 99)
+
+    # 结果输出
+    outline = title.to_dict('records')
+
+    title['seq_num'] = title.index
+
+    # 结果输出
+    title = title.to_dict('records')
+
+    text_df = pd.DataFrame(df.query(''' node_type == 'text' ''').to_dict('records'))
+    content_data = text_df.groupby('pageno')['text'].apply(lambda x: '\n'.join(x)).reset_index()
+    content_data['page_number'] = content_data['pageno']
+
+    # 结果输出
+    contents = content_data.to_dict('records')
+
+    table_data = pd.DataFrame(df.query(''' node_type == 'table' ''').to_dict('records'))
+    table_data['table'] = table_data['text']
+    table_data['table_name'] = ''
+    table_data['page_numbers'] = table_data['pageno'].apply(lambda x: [x])
+
+    # 结果输出
+    tables = table_data.to_dict('records')
+
+    return {"title": title, "outline": outline, "contents": contents, "tables": tables, "images": []}
 
 
 if __name__ == '__main__':
-    raw = get_ocr()
+    with open('D:/Users/sprivacy/Documents/WeChat Files/wxid_uqa5354ji3ag22/FileStorage/File/2024-08/三峡左岸地坪商务标_合并_ocr.txt', 'r', encoding='utf-8') as fp:
+        raw = json.load(fp)
+    raw = get_ocr(raw)
 
     # for content in raw['file_content']:
     #     print(content.keys())

+ 22 - 10
celery_tasks/parser.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-09-18 10:19:47
+# @Last Modified time: 2024-09-19 17:37:49
 import os
 import base64
 import requests
@@ -31,9 +31,11 @@ def create_task(url, file_path, file_url):
         file_url: 文件链接
     Returns: 响应
     """
+    file = open(file_path, 'rb').read()
+
     # 文件请求
     body = {
-        "file": (os.path.basename(file_path), open(file_path, 'rb'), "multipart/form-data"),
+        "file": (os.path.basename(file_path), file, "multipart/form-data"),
     }
 
     # 文件链接请求
@@ -66,21 +68,31 @@ def query_task(url, task_id):
 
 
 if __name__ == '__main__':
+    import time
     import json
 
     # client_id = 'DFIQUMXb59oGUDkvGhTw15mE'
     # client_secret = 'F5LkFLo4TatiLcCcJgIXbJrv5Kw04Rf0'
     # token = main(client_id, client_secret)['access_token']
     token = "24.0ab90c2e2b750b61995052ab6b94f62c.2592000.1728805729.282335-86574608"
-    print(token)
-    # request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
-    # file_path = "D:/desktop/三峡水利/data/projects/三峡左岸及地下电站地坪整治/评标报告/评标报告 .pdf"
-    # response = create_task(request_host, file_path, "")
-    # print(response)
+    # # print(token)
+    request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
+    file_path = "D:/desktop/三峡水利/celery.pdf"
+    response = create_task(request_host, file_path, "")
+    print(response)
+
+    time.sleep(10)
+
+    task_id = response['result']['task_id']
 
-    task_id = "task-nzNQul7cbiFRicw7IsF9dN9OE8QkZZ56"
     request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
     resp = query_task(request_host, task_id)
+
     print(resp)
-    # with open('评标报告.json', 'w', encoding='utf-8') as fp:
-    #     json.dump(resp, fp, indent=4, ensure_ascii=False)
+
+    url = resp['result']['parse_result_url']
+    response = requests.get(url)
+    response.encoding = 'utf-8'
+
+    with open('浙江国迈建设集团有限公司技术文件.json', 'w', encoding='utf-8') as fp:
+        json.dump(response.json(), fp, indent=4, ensure_ascii=False)

+ 1 - 0
pyproject.toml

@@ -18,6 +18,7 @@ scikit-learn = "1.1.1"
 celery = {extras = ["redis"], version = "^5.4.0"}
 jiagu = "^0.2.3"
 fuzzywuzzy = "^0.18.0"
+instructor = "^1.4.2"
 
 [[tool.poetry.source]]
 name = "tsinghua"