lfygithub01 8 months ago
Parent
commit
48a26e8342
5 changed files with 795 additions and 106 deletions
  1. export_filed_recommendation.py  +158 -0
  2. extract_financial_report.py  +172 -0
  3. matcher.py  +19 -8
  4. parse_textmind_result.py  +131 -18
  5. textmind_ocr.py  +315 -80

+ 158 - 0
export_filed_recommendation.py

@@ -0,0 +1,158 @@
+'''
+Expert field recommendation
+'''
+import re
+import torch
+import numpy as np
+from BaseTools import BaseMethods
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer, AutoModel
+
+
+class ExpertFiledRecommendation(BaseMethods):
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')
+        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path='code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')
+        self.expert_datasets = self.base_data_load()
+        # print(self.expert_datasets.head(1))
+        self.export_filed_list = self.base_read('data/export_filed_commendation_datasets/export_fileds_dict.txt')
+        self.filed_keywords = self.json_read('data/export_filed_commendation_datasets/filed_keywords.json')
+        
+
+    def base_data_load(self):
+        expert_filed = self.pandas_read_xls(file_path='data/export_filed_commendation_datasets/专家专业数据_主要及相关行业.xlsx', sheetname='sheet1')
+        expert_filed['招标项目名称'] = expert_filed.apply(lambda row: self.projectname_processing(row['招标项目名称']), axis=1)
+        expert_filed['projectName_embedding'] = expert_filed.apply(lambda row: self.get_embedding(row['招标项目名称']), axis=1)
+        expert_filed['projectContent_embedding'] = expert_filed.apply(lambda row: self.get_embedding(row['项目内容']), axis=1)
+        return expert_filed
+
+    def projectname_processing(self, project_name:str):
+        ''' Clean up the project name '''
+        project_name = project_name.split("竞争性谈判")[0]
+        project_name = re.sub(r'[\s\n]+|\.', '', project_name)
+        project_name = "".join(re.findall(r"[\u4e00-\u9fff]+", project_name))
+        return project_name
+
+    def get_embedding(self, text: str):
+        ''' Get the embedding of a piece of text '''
+        text = re.sub(r'[\s\n]+|\.','', text)[:510]
+        print("get_embedding_text: ",text)
+        encoded_input = self.tokenizer(text, return_tensors='pt')
+        with torch.no_grad():
+            output = self.model(**encoded_input)
+        text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
+        return text_embedding
+    
+    def get_embeddings(self, text_list: list) -> list:
+        text_embeddings = []
+        for text in text_list:
+            encoded_input = self.tokenizer(text, return_tensors='pt')
+            with torch.no_grad():
+                output = self.model(**encoded_input)
+            text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
+        return text_embeddings
+
+    def Tok1(self, embedding1, embedding2):
+        ''' Cosine similarity between two embeddings '''
+        similarities = cosine_similarity([embedding1], [embedding2])[0][0]
+        
+        return similarities
+    
+    def sim_average(self, sim1, sim2):
+        ''' Mean of two similarity scores '''
+        sim = round((sim1 + sim2) / 2, 2)
+        return sim
+
+    def filed_recommendation(self, project_info: dict) -> dict:
+        ''' Expert field recommendation '''
+        projectName = project_info['project_name']
+        content = re.sub("\n| ","",project_info['content'])
+        projectName_embedding = self.get_embedding(projectName)
+        projectContent_embedding = self.get_embedding(content)
+        self.expert_datasets['projectName_sim'] = self.expert_datasets.apply(lambda row: self.Tok1(row['projectName_embedding'], projectName_embedding), axis=1)
+        # print(self.expert_datasets.head(1))
+        self.expert_datasets['content_sim'] = self.expert_datasets.apply(lambda row: self.Tok1(row['projectContent_embedding'], projectContent_embedding), axis=1)
+        # print(self.expert_datasets.head(1))
+        self.expert_datasets['sim'] = self.expert_datasets.apply(lambda row: self.sim_average(row['projectName_sim'], row['content_sim']), axis=1)
+        print(self.expert_datasets.head(2))
+
+        max_row = self.expert_datasets.nlargest(1, 'sim')
+        max_sim = max_row.iat[0,-1]
+        if max_sim>=0.65: 
+            result = max_row.to_dict('records')[0]
+            return {
+                "firstSpeciality": result['主要专业'], 
+                "secondSpeciality": result['相关专业'],
+                "financeSpeciality": "财务管理专业"
+            }
+        else:
+            result = []
+            if '保护' in projectName:
+                result.append("保护专业")
+            if '监测' in projectName or '监控' in projectName:
+                result.append("监控信息专业")
+            if len(result) > 1: 
+                return {
+                    "firstSpeciality": result[0], 
+                    "secondSpeciality": result[1],
+                    "financeSpeciality": "财务管理专业"
+                }
+            values_hits = {}
+            for k,v in self.filed_keywords.items():
+                hited_value = 0
+                for value in v:
+                    value_nums = content.count(value)
+                    hited_value += value_nums
+                values_hits[k] = hited_value
+            values_hits = sorted(values_hits.items(), key=lambda x:x[1], reverse=True)[:2]
+            if len(result) == 1:
+                return {
+                    "firstSpeciality": result[0], 
+                    "secondSpeciality": values_hits[0][0],
+                    "financeSpeciality": "财务管理专业"
+                }
+            return {
+                "firstSpeciality": values_hits[0][0],
+                "secondSpeciality": values_hits[1][0],
+                "financeSpeciality": "财务管理专业"
+            }
+
+
+from fastapi import FastAPI
+import uvicorn
+from pydantic import BaseModel
+
+app = FastAPI()
+
+
+class ProjectInfo(BaseModel):
+    projectName: str
+    content: str
+
+class DemoResponse(BaseModel):
+    code: int = 0
+    msg: str=None
+    data: dict
+    ok: bool = False
+
+@app.post('/professional_recommendation')
+def professional_recommendation(info: ProjectInfo):
+    item_info = info.dict(exclude_unset=True)
+    if not item_info:
+        return {'code': 400, 'msg': '参数错误', 'data': None, 'ok': False}
+    print(info)
+    projectName = info.projectName
+    content = info.content
+    result = efr.filed_recommendation({'project_name': projectName, 'content': content})
+    if result:
+        print(result)
+        return {'code': 0, 'msg': '', 'data': result, 'ok': True}
+
+
+efr = ExpertFiledRecommendation()
+if __name__ == '__main__':
+    # efr = ExpertFiledRecommendation()
+    uvicorn.run(app='export_filed_recommendation:app', host='0.0.0.0', port=18884)  # 13888
+
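
A quick way to exercise the service above once it is running — a sketch only: the host/port come from the uvicorn.run call and the route and payload fields from ProjectInfo, while the example values themselves are made up.

    import requests

    payload = {
        "projectName": "某水电站中央空调系统主机设备改造",  # illustrative values, not taken from the dataset
        "content": "对中央空调系统主机设备进行改造,并完善监测与监控功能。",
    }
    resp = requests.post("http://127.0.0.1:18884/professional_recommendation", json=payload, timeout=120)
    print(resp.json())
    # expected shape: {"code": 0, "msg": "", "data": {"firstSpeciality": ..., "secondSpeciality": ...,
    #                  "financeSpeciality": "财务管理专业"}, "ok": True}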

+ 172 - 0
extract_financial_report.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+# @Author: privacy
+# @Date:   2024-06-11 13:43:14
+# @Last Modified by:   privacy
+# @Last Modified time: 2024-09-26 14:38:53
+import os
+import re
+
+from tqdm import tqdm
+from ocr import find_current_row
+from commonprocess import pic_ocr
+from instance_locate import get_instances_by_title
+
+
+def is_price(word: str) -> bool:
+    pattern = (
+        r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs"
+        r"|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|ден|RM|MT|lei|zł|USD|GBP|EUR|JPY"
+        r"|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR"
+        r"|PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED"
+        r"|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
+        r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
+    )
+    char_set = set('1234567890,.')
+    if re.fullmatch(pattern, word):
+        return True
+    elif all(s in char_set for s in word):
+        return True
+    else:
+        return False
+
+
+def extract_financial_report(title_list: list, table_list: list, image_list: list, year: int) -> list:
+    """
+    Parse the financial report sections
+
+    Args:
+        title_list: list of title nodes
+        table_list: list of tables
+        image_list: list of images
+        year:       reference year (the audit reports of the two preceding years are searched)
+
+    Returns:
+        results
+    """
+
+    instances = get_instances_by_title(
+        title_list=title_list,
+        table_list=table_list,
+        image_list=image_list,
+        instances=[
+            '财务状况',
+            '近年财务状况表',
+            '{}年审计报告'.format(year - 1),
+            '{}年审计报告'.format(year - 2),
+            '{}年财务审计报告'.format(year - 1),
+            '{}年财务审计报告'.format(year - 2)
+        ]
+    )
+
+    print("instances: ", instances)   
+
+
+    # TODO: some sections below are still extracted incorrectly, e.g.:
+    # Wrong titles extracted at 2020 年度审计报告 
+    # Wrong titles extracted at 2021 年度审计报告 
+    # Wrong titles extracted at 附件二  近年财务状况表
+
+    results = []
+
+    for item in instances:
+        if item['page_number'] >= item['end_page']:
+        # if item['page_number'] > item['end_page']:
+            print('Wrong titles extracted at {}'.format(item['title']))
+        elif item['tables']:
+            # table_name = [t['table_name'] for t in item['tables']]
+            table_name = [t['table_name'] if t['table_name'] else item["title"] for t in item['tables']]
+            profits = []
+            for table in item['tables']:
+                profit = []
+                for row in table['table']:
+                    if list(filter(lambda x: re.match(r'.*利润.*', x) is not None, row)):
+                        profit.append(row)
+                profits.append(profit)
+            results.append({
+                'title': table_name,
+                'result': profits,
+                'pages': [i['page_numbers'] for i in item['tables']],
+                'chapter': item['title']
+            })
+        elif item.get('images'):
+            print('未找到表格 图片识别中')
+            print(item.get('images'))
+
+            pages = [
+                img['page_number'] for img in item.get('images')
+            ]
+
+            ocr_results = [
+                # pic_ocr.apply_async(kwargs={'image_path': img['image_name']}).get(timeout=30)['rawjson']['ret']
+                pic_ocr(image_path = img['image_name'])['rawjson']['ret']
+                for img in item.get('images')
+            ]
+
+            candidate = []
+            rows = []
+            print('结果分析中')
+
+            for i, ret in tqdm(enumerate(ocr_results)):
+                for res in ret:
+                    if re.match(r'.*(净利润).*', res['word']) is not None:
+                    # if re.match(r'.*(利润).*', res['word']) is not None:
+                        top = res['rect']['top']
+                        bottom = res['rect']['top'] - res['rect']['height']
+                        candidate.append(
+                            {
+                                'page': pages[i],
+                                'text': res['word'],
+                                'top': top,
+                                'bottom': bottom,
+                            }
+                        )
+                        rows.append(find_current_row(ret, top, bottom))
+            for it in candidate:
+                print('定位:\t{}\t定位词:\t{}'.format(it['page'], it['text']))
+
+            for i, row in enumerate(rows):
+                title = []
+                profits = []
+                for w in row:
+                    if is_price(w['word']):
+                        profits.append(w['word'])
+                    else:
+                        title.append(w['word'])
+                if title and profits:
+                    results.append({
+                        'chapter': item['title'],
+                        'page': candidate[i]['page'],
+                        'title': title,
+                        'result': profits
+                    })
+
+    return results
+
+
+if __name__ == '__main__':
+    pass
+    # import json
+    # import datetime
+    # from settings import title_n_path, table_list_path, image_path
+
+    # with open(title_n_path, 'r', encoding='utf-8') as fp:
+    #     title_list = json.load(fp)
+
+    # with open(table_list_path, 'r', encoding='utf-8') as fp:
+    #     table_list = json.load(fp)
+
+    # with open(image_path, 'r', encoding='utf-8') as fp:
+    #     image_list = json.load(fp)
+
+    # y = datetime.datetime.now().year
+
+    # print(
+    #     extract_financial_report(
+    #         title_list=title_list,
+    #         table_list=table_list,
+    #         image_list=image_list,
+    #         year=2022
+    #     )
+    # )
+    
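
For reference, a few illustrative cases of what is_price above accepts — inferred from the currency-prefix regex and the digits/comma/dot fallback, not part of the committed code:

    assert is_price("1,234.56")      # every character is a digit, comma or dot
    assert is_price("¥1,234.56")     # a non-word, non-space prefix is matched by the [^\w\s] branch
    assert not is_price("净利润")     # no numeric content at all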

+ 19 - 8
matcher.py

@@ -2,7 +2,11 @@
 # @Author: privacy
 # @Date:   2024-06-27 09:33:01
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-06-27 14:44:43
+# @Last Modified time: 2024-08-23 12:10:09
+import os
+os.environ['TRANSFORMERS_OFFLINE'] = '1'
+os.environ['HF_DATASETS_OFFLINE'] = '1'
+
 import torch
 import numpy as np
 import pandas as pd
@@ -14,9 +18,12 @@ class Matcher:
     def __init__(self):
         # Load model directly
         # # # 加载预训练的text2vec模型和分词器
-        self.tokenizer = AutoTokenizer.from_pretrained("GanymedeNil/text2vec-base-chinese")
-        self.model = AutoModel.from_pretrained("GanymedeNil/text2vec-base-chinese")
+        # self.tokenizer = AutoTokenizer.from_pretrained("GanymedeNil/text2vec-base-chinese")
+        # self.model = AutoModel.from_pretrained("GanymedeNil/text2vec-base-chinese")
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')
+        self.model = AutoModel.from_pretrained(pretrained_model_name_or_path='code/model/models--GanymedeNil--text2vec-base-chinese/snapshots/f13ec0b6396814e1352f3d30fe80bb7079625777')
 
+    
     def TopK1(self, title: str, keywords: list, query_embedding, option_embeddings: list) -> pd.Series:
         # 计算相似度
         similarities = [cosine_similarity([query_embedding], [embedding])[0][0] for embedding in option_embeddings]
@@ -24,23 +31,27 @@ class Matcher:
         # 找到最相近的关键词
         most_similar_keyword = keywords[similarities.index(max(similarities))]
     
-        print(f"和 {title} 最相近的关键词是:{most_similar_keyword}")
+        # print(f"和 {title} 最相近的关键词是:{most_similar_keyword}")
     
         return pd.Series([most_similar_keyword, max(similarities)])
 
     def get_embedding(self, text: str):
-        encoded_input = tokenizer(text, return_tensors='pt')
+        encoded_input = self.tokenizer(text, return_tensors='pt',truncation=True,padding=True,max_length=512)
         with torch.no_grad():
-            output = model(**encoded_input)
+            try:
+                output = self.model(**encoded_input)
+            except Exception as e:
+                print(encoded_input['input_ids'].size())
+                raise ValueError(text) from e
         text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
         return text_embedding
     
     def get_embeddings(self, text_list: list) -> list:
         text_embeddings = []
         for text in text_list:
-            encoded_input = tokenizer(text, return_tensors='pt')
+            encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
             with torch.no_grad():
-                output = model(**encoded_input)
+                output = self.model(**encoded_input)
             text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
         return text_embeddings
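
A minimal sketch of how Matcher is meant to be driven — the keyword list and title below are illustrative; TopK1 returns a two-element pandas Series holding the best-matching keyword and its cosine similarity:

    matcher = Matcher()
    keywords = ["开标一览表", "投标函", "法定代表人身份证明"]  # hypothetical candidate keywords
    option_embeddings = matcher.get_embeddings(keywords)
    title = "开标一览表(报价表)"
    best_keyword, score = matcher.TopK1(title, keywords, matcher.get_embedding(title), option_embeddings)
    print(best_keyword, round(float(score), 3))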
 

+ 131 - 18
parse_textmind_result.py

@@ -36,34 +36,74 @@ def paese_content(layouts:list):
         return pd.NA
     contents = []
     for layout in layouts:
-        if layout['sub_type'] != 'table' or layout['sub_type'] != 'image' or layout['sub_type'] != 'seal':
+        if layout['type'] not in ('table', 'image', 'seal', 'head_tail'):
+            if not layout['text']: continue
             contents.append(layout['text'])
-    return "".join(contents).replace('\n\n','\n').replace(' ','')
+    return "\n".join(contents).replace('\n\n','\n').replace(' ','')
 
-def parse_table_name(tables:list, images:list, layouts:list):
+def parse_table_name(tables:list, layouts:list):
     '''  '''
+
     if not tables:
         return pd.NA
-    table_names = []
+    
+    node_dict = {}
     for layout in layouts:
-        if layout['sub_type'] == 'table_title' or layout['sub_type'] == 'head_tail':
-            table_names.append(re.sub("\n| ","",layout['text']))
-    for image in images:
-        for content_layouts in image['content_layouts']:
-            if content_layouts['sub_type'] == 'table_title' or layout['sub_type'] == 'head_tail':
-                table_names.append(re.sub("\n| ","",content_layouts['text']))
+        if not layout['children']: continue
+        node_dict[layout['text']] = layout['children']   # map each layout's text to its children layout_ids (one-to-one)
+    
+    table_ids = []
+    for table in tables:
+        table_ids.append({'layout_id':table['layout_id']})
     
+    table_names = []
+    for table_id in table_ids:
+        layout_id = table_id['layout_id']
+        for text, children in node_dict.items():
+            if layout_id in children:
+                table_names.append(text)
+
+    if not table_names:
+        layout_ids = []
+        for layout in layouts:
+            layout_ids.append({layout['layout_id']:layout['text']}) 
+        
+        table_layout_ids = []
+        for table in tables:
+            table_layout_ids.append({'layout_id':table['layout_id']})
+        
+        index_ = 0
+        for table_layout_id in table_layout_ids:
+            for layout_id in layout_ids:
+                if table_layout_id['layout_id'] in layout_id:
+                    index_ = layout_ids.index(layout_id)
+                    break
+        
+        for ids in layout_ids[:index_]:
+            for value in ids.values():
+                if '表' in value: table_names.append(value)
+        
+        if not table_names and index_ > 0:
+            table_names.append(list(layout_ids[index_-1].values())[0])
+
+
     return ";".join(table_names)
                 
 def parse_title(layouts:list):
         ''' 解析标题 '''
         if not layouts: return pd.NA
         for layout in layouts:
-            if layout['type'] == 'title':
-                return re.sub("\n","",layout['text'])
-        for layout in layouts:
-            if layout['text']:
-                return re.sub("\n","",layouts[0]['text']) if len(layouts[0]['text']) < 15 else pd.NA
+            if (layout['type'] == 'title' or 'title' in layout['sub_type']) and layout['text'] and layout['type'] != 'head_tail':
+                text = re.sub("\n","",layout['text'])
+                if not text: continue
+                return text
+        for layout in layouts: 
+            if not (layout['type'] == 'text' and layout['text']): continue
+            text = re.sub("\n","",layout['text'])
+            if text and len(text) < 30:
+                return text
+        return pd.NA
+
 def parse_table(markdown:str):
     table = []
     lines = markdown.split('\n')
@@ -95,6 +135,7 @@ def get_ocr_new(raw:dict, pretty: bool = False):
     title_df = title_df.rename(columns={'page_num':'page_number'})
     title_df['title'] = df['layouts'].apply(lambda x: parse_title(x))
     title_df['box'] = df['layouts'].apply(lambda x: x[0]['position'] if x else pd.NA)
+    # title_df['box'] = df[df['layouts'].apply(lambda x: x[0]['position'] if x else False)]
     title_df['node_type'] = df['layouts'].apply(lambda x: x[0]['type'] if x else pd.NA)
     title_df['para_type'] = df['layouts'].apply(lambda x: x[0]['sub_type'] if x else pd.NA)
     title_df['text'] = title_df['title']
@@ -112,7 +153,7 @@ def get_ocr_new(raw:dict, pretty: bool = False):
     table_df['page_num'] = table_df['page_num'].apply(lambda x: [x])
     table_df = table_df.rename(columns={'page_num':'page_numbers'})
     table_df['table'] = df['tables'].apply(lambda x: parse_table(x[0]['markdown']) if x else pd.NA)
-    table_df['table_name'] = df.apply(lambda x: parse_table_name(x['tables'], x['images'], x['layouts']), axis=1)
+    table_df['table_name'] = df.apply(lambda x: parse_table_name(x['tables'], x['layouts']), axis=1)
     table_df.dropna(inplace=True)
 
     table = table_df.to_dict('records')
@@ -122,7 +163,9 @@ def get_ocr_new(raw:dict, pretty: bool = False):
     return {"title": title, "outline": outline, "contents": content, "tables": table, "images": []}
  
 
-if __name__ == '__main__':
+
+
+def run():
     basepath = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/30份数据整理'
     for save_file in os.listdir(basepath):
         save_file_path = os.path.join(basepath, save_file)
@@ -130,9 +173,11 @@ if __name__ == '__main__':
             if '投标文件' == save_file_name:
                 save_file_name_path = os.path.join(save_file_path,save_file_name)
                 textmind_save_dir = os.path.join(save_file_name_path,'textmind')
+                if not os.path.exists(textmind_save_dir): continue
                 for bidder_name in os.listdir(textmind_save_dir):
-                    if bidder_name[-13:] != 'textmind.json': continue
+                    if 'textmind.json' not in bidder_name[-13:]: continue
                     textmind_result_path = os.path.join(textmind_save_dir, bidder_name)
+                    print("textmind_result_path ",textmind_result_path)
                     with open(textmind_result_path, 'r', encoding='utf-8') as fp:
                         raw = json.load(fp)
                         try:
@@ -153,3 +198,71 @@ if __name__ == '__main__':
                         except:
                             print(textmind_result_path)
                             raise ValueError("stop")
+
+
+def parse_datasets():
+    base_dir = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
+    # pre_parse_datasets = []
+
+    for base_folders in os.listdir(base_dir):
+        base_folder = os.path.join(base_dir, base_folders)
+        folder_info = {}
+        for folders in os.listdir(base_folder):
+            folder = os.path.join(base_folder, folders)
+            if folders == "招标文件":
+                for file in os.listdir(folder):
+                    if file.endswith(".pdf"):
+                        projectName = file.split(".")[0] # file name without the extension
+                        tender_file = os.path.join(folder, file)
+
+                        # folder_info["projectName"] = projectName
+                        # folder_info["buyFile"] = tender_file
+                        
+            elif folders == '投标文件':
+                # folder_info["bidder_info"] = []
+                print("folder:", folder)
+                for file in os.listdir(folder):
+                    # if file.endswith(".pdf"):
+                    #     bidderUnit = file.split(".")[0] # 去掉后缀之后的文件名
+                    #     bidder_file = os.path.join(folder, file)
+
+                    #     folder_info["bidder_info"].append({"bidderUnit":bidderUnit, "bidderFile":bidder_file})
+                    if file == 'textmind':
+                        textmind_result_path = os.path.join(folder, file)
+                        for textmind_json in os.listdir(textmind_result_path):
+                            if '_textmind' not in textmind_json: continue
+                            bidderUnit = textmind_json.split("_")[0] # _textmind.json
+                            textmind_file_path = os.path.join(textmind_result_path, textmind_json)
+                            with open(textmind_file_path, 'r', encoding='utf-8') as fp:
+                                raw = json.load(fp)
+                                try:
+                                    raw = get_ocr_new(raw=raw, pretty=True)
+                                    for k, v in raw.items():
+                                        if k == 'title':
+                                            with open(f'{textmind_result_path}/{bidderUnit}_bidding_title.json', 'w', encoding='utf-8') as fo:
+                                                json.dump(v, fo, ensure_ascii=False)
+                                        elif k == 'outline':
+                                            with open(f'{textmind_result_path}/{bidderUnit}_bidding_outlines.json', 'w', encoding='utf-8') as fo:
+                                                json.dump(v, fo, ensure_ascii=False)
+                                        elif k == 'contents':
+                                            with open(f'{textmind_result_path}/{bidderUnit}_bidding_content.json', 'w', encoding='utf-8') as fo:
+                                                json.dump(v, fo, ensure_ascii=False)
+                                        elif k == 'tables':
+                                            with open(f'{textmind_result_path}/{bidderUnit}_bidding_tables.json', 'w', encoding='utf-8') as fo:
+                                                json.dump(v, fo, ensure_ascii=False)
+                                except:
+                                    print(textmind_result_path)
+                                    raise ValueError("stop")
+
+        # pre_parse_datasets.append(folder_info)
+
+    # Walk the directory tree ahead of time to build the contents of the save folders
+    # pre_parse_datasets
+
+
+if __name__ == '__main__':
+    run()
+
+    # parse_datasets()
+
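
For orientation, get_ocr_new returns the dict shown at its return statement, and run()/parse_datasets() dump those pieces to per-bidder JSON files. A small sketch of consuming one result (the input path is illustrative):

    import json

    with open('some_bidder_textmind.json', 'r', encoding='utf-8') as fp:  # hypothetical TextMind output
        raw = json.load(fp)
    parsed = get_ocr_new(raw=raw, pretty=True)
    # parsed == {"title": ..., "outline": ..., "contents": ..., "tables": [...], "images": []}
    for table in parsed["tables"]:
        print(table["table_name"], table["page_numbers"])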

+ 315 - 80
textmind_ocr.py

@@ -1,28 +1,131 @@
 
 import requests, os, time, json, base64
+import tqdm, re
+'''bos_sample_conf'''
+from baidubce.bce_client_configuration import BceClientConfiguration
+from baidubce.auth.bce_credentials import BceCredentials
+'''bos'''
+import sys
+import numpy as np
+# Import the BOS-related modules
+from baidubce import exception
+from baidubce.services import bos
+from baidubce.services.bos import bos_handler
+from baidubce.services.bos import storage_class
+from baidubce.services.bos import canned_acl
+from baidubce.services.bos.bos_client import BosClient
 
-def create_task(url, file_path, file_url):
-    """
-    Args:
-        url: string, 服务请求链接
-        file_path: 本地文件路径
-        file_url: 文件链接
-    Returns: 响应
-    """
-    file = open(file_path, 'rb').read()
 
-    # 文件请求
-    body = {
-        "file": (os.path.basename(file_path), file, "multipart/form-data"),
-    }
+'''bos_sample_conf'''
+# Configure the BosClient host, Access Key ID and Secret Access Key
+bos_host = "bj.bcebos.com"
+access_key_id = "87815919190940dd9ff8a7790281e1e9"
+secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"
 
-    data = {
-        "file_name": os.path.basename(file_path),
-        "return_para_nodes": True
-    }
+access_key_id = "87815919190940dd9ff8a7790281e1e9"
+secret_access_key = "8ac48b64cfe94fd4b4be72a26a48270d"
 
-    response = requests.post(url, data=data, files=body)
-    return response.json()
+access_key_id = "ALTAKEq9L0oxxxDi5jUc3e12gu"
+secret_access_key = "9336a04f88e845e284bab26bd5fd8182"
+
+# Create the BceClientConfiguration
+config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint = bos_host)
+
+'''bos'''
+bos_client = BosClient(config)
+"""
+response = bos_client.list_buckets()
+for bucket in response.buckets:
+     print (bucket.name)
+"""
+# Use the ListObjects API to fetch object keys under the given prefix
+def get_objects(prefix, max_keys=10):
+    objects = bos_client.list_objects('ocrtrips', max_keys=max_keys, prefix=prefix)
+    return objects.contents
+
+# Upload a local file and return its public URL
+def put_bos(object_key, file_name, bucket_name='ctrimgs'):
+    bos_client.put_object_from_file(bucket_name, object_key, file_name)
+    return 'https://ctrimgs.bj.bcebos.com/' + object_key
+    #return bos_client.put_object_from_file(bucket_name, object_key, file_name)
+
+# Delete an object
+def delete_bos(object_key, bucket_name='ctrimgs'):
+    bos_client.delete_object(bucket_name, object_key)
+    return ''
+
+# Download an object to a local file
+def get_bos(bucket_name, object_key, file_name):
+    bos_client.get_object_to_file(bucket_name,
+                                  object_key,
+                                  file_name)
+
+# List objects under a prefix in the given bucket
+def get_object_lists(bucket_name, prefix, max_keys=10):
+    objects = bos_client.list_objects(bucket_name, max_keys=max_keys, prefix=prefix)
+    return objects.contents
+# Multipart upload for files larger than 5 GB
+def get_multipart(bucket_name, object_key, file_name):
+
+    upload_id = bos_client.initiate_multipart_upload(bucket_name, object_key).upload_id
+
+    left_size = os.path.getsize(file_name)
+    # starting offset of the next part
+    offset = 0
+
+    part_number = 1
+    part_list = []
+
+    while left_size > 0:
+        # use 5 MB parts
+        part_size = 5 * 1024 * 1024
+        if left_size < part_size:
+            part_size = left_size
+
+        response = bos_client.upload_part_from_file(
+            bucket_name, object_key, upload_id, part_number, part_size, file_name, offset)
+
+
+        left_size -= part_size
+        offset += part_size
+        part_list.append({
+            "partNumber": part_number,
+            "eTag": response.metadata.etag
+        })
+
+
+        part_number += 1
+
+    bos_client.complete_multipart_upload(bucket_name, object_key, upload_id, part_list)
+
+
+
+'''textmind_ocr'''
+# def create_task(url, file_path, file_url):
+#     """
+#     Args:
+#         url: string, 服务请求链接
+#         file_path: 本地文件路径
+#         file_url: 文件链接
+#     Returns: 响应
+#     """
+#     file = open(file_path, 'rb').read()
+
+#     # 文件请求
+#     body = {
+#         "file": (os.path.basename(file_path), file, "multipart/form-data"),
+#     }
+
+#     data = {
+#         "file_name": os.path.basename(file_path),
+#         "return_para_nodes": True
+#     }
+
+#     response = requests.post(url, data=data, files=body)
+#     return response.json()
 
 def create_task_1(url, file_path, file_url):
     """
@@ -35,11 +138,17 @@ def create_task_1(url, file_path, file_url):
    # 文件请求
     with open(file_path, "rb") as f:
         file_data = base64.b64encode(f.read())
-    data = {
-        "file_data": file_data,
-        "file_url": file_url,
-        "file_name": os.path.basename(file_path)
-    }
+    if file_url:
+        data = {
+                "file_url": file_url,
+                "file_name": os.path.basename(file_path)
+            }
+    else:
+        data = {
+            "file_data": file_data,
+            "file_url": file_url,
+            "file_name": os.path.basename(file_path)
+        }
     
     # 文档切分参数,非必传
     # return_doc_chunks = json.dumps({"switch": True, "chunk_size": -1})
@@ -51,19 +160,19 @@ def create_task_1(url, file_path, file_url):
     return response.json()
 
 
-def query_task(url, task_id):
-    """
-    Args:
-        url: string, 请求链接
-        task_id: string, task id
-    Returns: 响应
-    """
-    data = {
-        "task_id": task_id
-    }
+# def query_task(url, task_id):
+#     """
+#     Args:
+#         url: string, 请求链接
+#         task_id: string, task id
+#     Returns: 响应
+#     """
+#     data = {
+#         "task_id": task_id
+#     }
 
-    response = requests.post(url, data=data, files=data)
-    return response.json()
+#     response = requests.post(url, data=data, files=data)
+#     return response.json()
 
 def query_task_1(url, task_id):
     """
@@ -80,54 +189,180 @@ def query_task_1(url, task_id):
     response = requests.post(url, headers=headers, data=data)
     return response.json()
 
-def request1(bidderFile,nums:int=1):
-    try:
-        response = create_task_1(request_host, bidderFile, "")
-        print('res1  :',response)
-        task_id = response['result']['task_id']
-        if not task_id: raise ValueError('task_id is None')
-    except Exception as e:
-        print("request1 :",e)
-        time.sleep(10)
-        nums += 1
-        if nums > 100: return 
-        task_id = request1(bidderFile, nums)
-    return task_id
-    
-def request2(task_id,nums:int=1):
-    try:
-        resp = query_task_1(request_query_host, task_id)
-        print('res2  :',resp)
-        url = resp['result']['parse_result_url']
-        response = requests.get(url)
-        response.encoding = 'utf-8'
-        response.json()
-    except Exception as e:
-        print("request2 :",e)
-        time.sleep(20)
-        nums += 1
-        if nums > 500: return 
-        response = request2(task_id,nums)
-    return response
-
-
-token = "24.87693e5dd8c2d7d7accf260bb2d265d2.2592000.1733970962.282335-86574608"
+def request1(bidderFile, bidderUrl:str = '', nums:int = 1, max_nums:int = 50):
+    while nums < max_nums:
+        try:
+            response = create_task_1(request_host, bidderFile, bidderUrl)
+            print('res1  :',response)
+            task_id = response['result'].get('task_id', None)
+            if not task_id: raise ValueError('task_id is None')
+            return task_id
+        except Exception as e:
+            print("request1 :",e)
+            nums += 1
+            time.sleep(10)
+            
+
+def request2(task_id, nums:int = 1, max_nums: int = 500):
+    while nums < max_nums:
+        try:
+            resp = query_task_1(request_query_host, task_id)
+            print('res2  :', resp)
+            if resp['result']['status'] == 'success':
+                url = resp['result']['parse_result_url']
+                # url = resp['result']['markdown_url']  # fetch the markdown result instead (plain text)
+                response = requests.get(url)
+                response.encoding = 'utf-8'
+                response.json()
+                return response
+        except Exception as e:
+            print("request2 :", e)
+        # wait between polls whether the task is still running or the query failed
+        nums += 1
+        time.sleep(20)
+
+# def request2(task_id,nums:int=1):
+#     try:
+#         resp = query_task_1(request_query_host, task_id)
+#         print('res2  :',resp)
+#         url = resp['result']['parse_result_url']
+#         response = requests.get(url)
+#         response.encoding = 'utf-8'
+#         response.json()
+#     except Exception as e:
+#         print("request2 :",e)
+#         time.sleep(20)
+#         nums += 1
+#         if nums > 500: return 
+#         response = request2(task_id,nums)
+#     return response
+
+
+token = "24.8dc8595999193e140449656989204d61.2592000.1736062425.282335-86574608"
 # request_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task?access_token={token}"
 request_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task?access_token={token}"  # 更新
 # request_query_host = f"https://aip.baidubce.com/file/2.0/brain/online/v1/parser/task/query?access_token={token}"
 request_query_host = f"https://aip.baidubce.com/rest/2.0/brain/online/v2/parser/task/query?access_token={token}"  # 更新
-# 测试pdf文件
-# file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
-file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
 
-# time.sleep(5)
-task_id = request1(file_path)
-print('1   :',task_id)
+def test():
+    # test PDF file
+    # file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/结果测试数据/1-监控系统自主可控大型PLC适配研究_中控技术产品_采购程序文件_2/投标文件/北京中天华拓工程技术有限公司-投标文件.pdf"
+    # file_path = "data/zhaocai_datasets/30份数据整理/1-2021_2022年三峡电站左岸厂房中央空调系统主机设备改造/投标文件/广东申菱环境系统股份有限公司.pdf"
+    file_path = "/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/北京华科同安监控技术有限公司.pdf"
+
+    # time.sleep(5)
+    task_id = request1(file_path)
+    print('1   :',task_id)
+
+    time.sleep(10)
+    response = request2(task_id)
+    # print('2 file_name :',response.json()['file_name'])
+
+    # Save the textmind parse result
+    # with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.json', 'w', encoding='utf-8') as fp:
+    #     json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+    with open('data/预审查数据/textmind_result/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究_北京华科同安监控技术有限公司.md', 'w', encoding='utf-8') as fp:
+        fp.write(response.text)
+
+# test()
+
+
+def parse_pdf():
+    base_dir = r'/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4'
+    save_dir = 'data/预审查数据/20241122-4/ocr_result'
+    os.makedirs(save_dir, exist_ok=True)
+
+    pre_parse_datasets = []
+
+    # Walk every project folder under base_dir
+    for base_folders in os.listdir(base_dir):
+        base_folder = os.path.join(base_dir, base_folders)
+        folder_info = {}
+        for folders in os.listdir(base_folder):
+            folder = os.path.join(base_folder, folders)
+            if folders == "招标文件":
+                for file in os.listdir(folder):
+                    if file.endswith(".pdf"):
+                        projectName = file.split(".")[0] # file name without the extension
+                        tender_file = os.path.join(folder, file)
+
+                        folder_info["projectName"] = projectName
+                        folder_info["buyFile"] = tender_file
+                        
+            elif folders == '投标文件':
+                folder_info["bidder_info"] = []
+                for file in os.listdir(folder):
+                    if file.endswith(".pdf"):
+                        bidderUnit = file.split(".")[0] # file name without the extension
+                        bidder_file = os.path.join(folder, file)
+
+                        folder_info["bidder_info"].append({"bidderUnit":bidderUnit, "bidderFile":bidder_file})
+
+        pre_parse_datasets.append(folder_info)
+        # break
+
+    # pre_parse_datasets = parse_pdf()
+    # print(pre_parse_datasets)
+
+    # Start parsing the PDFs
+    for pre_parse_dataset in pre_parse_datasets:
+        bidder_info = pre_parse_dataset['bidder_info']
+        projectName = pre_parse_dataset['projectName']
+        buyFile = pre_parse_dataset['buyFile']
+        for bidder_firm in bidder_info:
+            bidderFile = bidder_firm['bidderFile']
+            bidderUnit = bidder_firm['bidderUnit']
+            task_id = request1(bidderFile)
+            response = request2(task_id)
+            with open(f"{save_dir}/{projectName}_1_{bidderUnit}_textmind.json", 'w', encoding='utf-8') as fp:  # buyFile is a full path, so use projectName in the file name
+                json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+
+    return pre_parse_datasets
+
+
+    
+def picture_ocr(image_path:str):
+    ''' OCR result for a single image '''
+    task_id = request1(image_path)
+    response = request2(task_id)
+    save_file_path = "_".join(image_path[:-4].split('/')[-3:])
+    print(save_file_path)
+    with open(f"data/预审查数据/download/{save_file_path}_textmind.json", 'w', encoding='utf-8') as fp:
+        json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+# picture_ocr('/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4测试数据/水车室复杂高危作业环境的模块化集成检修装备研制/中国科学院沈阳自动化研究所/scanned/page-134.jpg')
+
+
+def parse_single_file(file_path:str, save_dir:str):
+    '''
+    Parse a single file; files larger than ~50 MB are uploaded to BOS first and passed to TextMind by URL
+    '''
+    def get_FileSize(filePath):
+        fsize = os.path.getsize(filePath)
+        fsize = fsize/float(1024*1024)
+        return round(fsize, 2)
+    
+    
+    
+    file_name = os.path.basename(file_path)
+    file_name = re.sub('\040', '', file_name)
+
+    # if file_name:
+    #     delete_bos(object_key=file_name)
+
+    file_url = ''
+    if get_FileSize(file_path) > 49:
+        print('file_size > 50M')
+        file_url = put_bos(object_key=file_name, file_name=file_path)
+        print(file_url)
+
+    task_id = request1(file_path, file_url)
+    response = request2(task_id)
+    
+    if file_name:
+        delete_bos(object_key=file_name)
+
+    save_file_path = os.path.join(save_dir, file_name[:-4])
 
-time.sleep(10)
-response = request2(task_id)
-print('2 file_name :',response.json()['file_name'])
+    with open(f'{save_file_path}_textmind.json', 'w', encoding='utf-8') as fp:
+        json.dump(response.json(), fp, indent=4, ensure_ascii=False)
 
-# 保存textmind解析结果
-with open('data/预审查数据/textmind_result/2021_2022年三峡电站左岸厂房中央空调系统主机设备改造_广东申菱环境系统股份有限公司.json', 'w', encoding='utf-8') as fp:
-    json.dump(response.json(), fp, indent=4, ensure_ascii=False)
+file_path = '/mnt/d/Work_PWS/24y_04_SanXia/data/甲方提供材料/20241122-4/基于大数据驱动的水电站辅助设备在线监测与预警诊断研究/投标文件/河海大学.pdf'
+save_path = 'data/预审查数据/download'
+# parse_single_file(file_path, save_path)
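
Taken together, a rough end-to-end sketch of how the pieces in this commit fit — the paths are illustrative, and which parsed list feeds each extract_financial_report argument is an assumption, since the committed callers load those lists from separately saved JSON files:

    import json
    from textmind_ocr import parse_single_file
    from parse_textmind_result import get_ocr_new
    from extract_financial_report import extract_financial_report

    parse_single_file('bidder.pdf', 'out')  # OCR the PDF via TextMind, writes out/bidder_textmind.json
    with open('out/bidder_textmind.json', 'r', encoding='utf-8') as fp:
        raw = json.load(fp)
    parsed = get_ocr_new(raw=raw, pretty=True)
    report = extract_financial_report(
        title_list=parsed['title'],   # assumed mapping of parsed pieces to arguments
        table_list=parsed['tables'],
        image_list=parsed['images'],
        year=2024,
    )
    print(report)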