sprivacy преди 11 месеца
родител
ревизия
b66967d53c
променени са 11 файла, в които са добавени 619 реда и са изтрити 301 реда
  1. 156 0
      README.md
  2. 158 190
      document_.py
  3. 26 9
      extract_financial_report.py
  4. 38 12
      extract_price.py
  5. 73 5
      get_info.py
  6. 36 22
      instance_locate.py
  7. 9 5
      matcher.py
  8. 3 2
      ocr_api.py
  9. 47 19
      project_loc.py
  10. 3 1
      requirements.txt
  11. 70 36
      text_extractor.py

+ 156 - 0
README.md

@@ -1,6 +1,155 @@
+服务架构
+---
+
+![image-20240826152353199](image-20240826152353199.png)
+
+流程结构
+---
+
+```mermaid
+graph LR
+    工厂[工厂] --> 创建工厂
+    创建工厂 --> Celery实例
+    创建工厂 --> Flask实例
+    Celery实例 --> Redis中间件
+    Celery实例 --> Worker1
+    Celery实例 --> Worker2
+    Celery实例 --> Worker3
+    Celery实例 --> Worker4
+    Celery实例 --> Worker5
+    Flask实例 --> submit
+    submit --> Redis中间件
+    Redis中间件 --> Worker1
+    Redis中间件 --> Worker2
+    Redis中间件 --> Worker3
+    Redis中间件 --> Worker4
+    Redis中间件 --> Worker5
+    Worker1 --> ALGNODE1
+    Worker2 --> ALGNODE2
+    Worker3 --> ALGNODE3
+    Worker4 --> ALGNODE4
+    Worker5 --> ALGNODE5
+```
+
+
+```mermaid
+graph LR
+    ALGNODE[ALGNODE] --> 通用PDF抽取模块
+    通用PDF抽取模块 --> 图片解析
+    通用PDF抽取模块 --> 表格解析
+    通用PDF抽取模块 --> 正文解析
+    通用PDF抽取模块 --> 标题解析
+    图片解析 --> 图片OCR文本
+    表格解析 --> 表格内容
+    正文解析 --> 正文内容
+    标题解析 --> 标题内容
+    图片OCR文本 --> 通用数据回收
+    表格内容 --> 通用数据回收
+    正文内容 --> 通用数据回收
+    标题内容 --> 通用数据回收
+    通用数据回收 --> 采购数据
+    通用数据回收 --> 投标数据
+    采购数据 --> 预审
+    投标数据 --> 预审
+    采购数据 --> 初审
+    投标数据 --> 初审
+    采购数据 --> 详审
+    投标数据 --> 详审
+    采购数据 --> 报价评审
+    投标数据 --> 报价评审
+    预审 --> 回传
+    初审 --> 回传
+    详审 --> 回传
+    报价评审 --> 回传
+```
+
+代码层
+---
+
+### 异步配置文件(backend/config.py)
+
+```python
+class Config:
+    CELERY = dict(
+        CELERY_BROKER_URL = 'redis://localhost:6379/0',
+        CELERY_RESULT_BACKEND = 'redis://localhost:6379/0',
+        include = "backend.celery_task",
+        task_ignore_result = True,
+        timezone = 'Asia/Shanghai',
+        enable_utc = False,
+        task_track_started = True
+    )
+```
+
+### 创建工厂(backend/__init__.py)
+
+```python
+from flask import Flask
+from celery import Celery, Task
+
+celery_app = Celery(__name__)
+
+def create_app(test_config: dict = None) -> Flask:
+    app = Flask(__name__)
+
+    class FlaskTask(Task):
+        def __call__(self, *args: object, **kwargs: object) -> object:
+            with app.app_context():
+                return self.run(*args, **kwargs)
+
+    if test_config is None:
+        app.config.from_pyfile('config.py', silent=True)
+    else:
+        app.config.from_mapping(test_config)
+
+    celery_app.config_from_object(app.config['CELERY'])
+    celery_app.Task = FlaskTask
+    celery_app.set_default()
+
+    return app
+```
+
+### 封包脚本(make_celery.py)
+
+```python
+from backend import (
+    create_app,
+    celery_app,
+)
+
+app = create_app()
+```
+
+### WEB配置 (gunicorn_config.py)
+
+```python
+workers = 1
+bind = "0.0.0.0:8000"
+backlog = 2048
+loglevel = "INFO"
+daemon = True
+pidfile = "/var/run/backend.pid"
+accesslog = "/var/log/backend/access.log"
+errorlog = "/var/log/backend/error.log"
+```
+
+### 执行脚本(run.sh)
+
+```bash
+gunicorn --config gunicorn_config.py make_celery:app
+celery multi start worker -A make_celery:celery_app -P prefork -E --loglevel=INFO --logfile=/var/log/celery/%n%I.log --pidfile=/run/celery/%n.pid
+celery -A make_celery:celery_app events
+```
+
+
+
+
+
 
 
 主要模块描述
+---
+
 1、tools     大纲解析模块
 2、get_info  PDF信息抽取模块
 3、matcher   段落定位模块
@@ -11,6 +160,13 @@
 7、LLMAgent      大模型调用模块
 8、document_     招标文件解析模块
 
+9、extract_price 报价抽取
+
+10、doc2pdf      word文档转pdf模块
+
+11、extract_financial_report 财报抽取
+
+12、ocr_api      图片OCR类
 
 ##### PDF中无边框表格内容抽取
 ```

+ 158 - 190
document_.py

@@ -25,50 +25,47 @@ chinese_num_map = {
     '十': 10
 } 
 
-def create_logger(log_path):
-    """
-    将日志输出到日志文件和控制台
-    """
-    logger = logging.getLogger()
-    logger.setLevel(logging.INFO)
-
-    formatter = logging.Formatter(
-        '%(asctime)s - %(levelname)s - %(message)s')
-
-    # 创建一个handler,用于写入日志文件
-    file_handler = logging.FileHandler(
-        filename=log_path, mode='w')
-    file_handler.setFormatter(formatter)
-    file_handler.setLevel(logging.INFO)
-    logger.addHandler(file_handler)
-
-    # 创建一个handler,用于将日志输出到控制台
-    console = logging.StreamHandler()
-    console.setLevel(logging.DEBUG)
-    console.setFormatter(formatter)
-    logger.addHandler(console)
-
-    return logger
-
-log_path = "code/logs/logs.log" 
-logger = create_logger(log_path=log_path)
-
-class DocumentPreReview():
-    def __init__(self, file_path) -> None:
+# def create_logger(log_path):
+#     """
+#     将日志输出到日志文件和控制台
+#     """
+#     logger = logging.getLogger()
+#     logger.setLevel(logging.INFO)
+
+#     formatter = logging.Formatter(
+#         '%(asctime)s - %(levelname)s - %(message)s')
+
+#     # 创建一个handler,用于写入日志文件
+#     file_handler = logging.FileHandler(
+#         filename=log_path, mode='w')
+#     file_handler.setFormatter(formatter)
+#     file_handler.setLevel(logging.INFO)
+#     logger.addHandler(file_handler)
+
+#     # 创建一个handler,用于将日志输出到控制台
+#     console = logging.StreamHandler()
+#     console.setLevel(logging.DEBUG)
+#     console.setFormatter(formatter)
+#     logger.addHandler(console)
+
+#     return logger
+
+# log_path = "./logs.log" 
+# logger = create_logger(log_path=log_path)
+
+class DocumentPreReview:
+    def __init__(self) -> None:
         self.bm = BaseMethods()
-        self.Bidding_tables = self.get_Bidding_table(file_path)
-    
-    
+
     def get_Bidding_table(self, file_path:str):
         ''' get table data
         '''
         # file_path = "data/预审查数据/三峡左岸及电源电站中央空调系统管网及末端改造(发布稿)-table.json"
         # file_path = "data/预审查数据/2023年档案管理系统功能优化项目采购程序文件-table.json"
         all_tables = self.bm.json_read(file_path)
+        self.Bidding_tables = all_tables
         return all_tables
-    
-    
-    
+
     def _scrutinize_judge(self, tag:str, threshold_value:int=3):
         ''' Clause number content judgment 
             商务 技术 报价 评审 评分 标准
@@ -153,9 +150,6 @@ class DocumentPreReview():
                 tables_list.append(partial_form)
         return tables_list
 
-
-
-
     def get_table(self):
         ''' parse the Bidding_tables.json file to get the table data from it.
         '''
@@ -180,176 +174,150 @@ class DocumentPreReview():
             title_len = partial_form['title_len']
             tables = partial_form["table"]
             
-            if '投标人须知前附表' == table_name:  
-                record_page = page_number[0]
-            if page_number[0] < record_page + 3: 
-                for table in tables[1:]:
-                    if '条' in table: continue    # 存在BUG            
-                    try:
-                        if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
-                        if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
-                    except:
-                        logger.error('该文件中的投标人须知前附表部分表格没有边框,只有中间部分表格存在边框,提取代码认为只有边框存在才被判定为表格内容')
-            
-            form_sign = re.findall('评\w+法前附表',table_name)
-            if form_sign:
-                table_page_num = page_number[-1]
-                inital_data = tables[0]
-                # confirm data location
-                regulation_number_index = inital_data.index("条款号")
-                evaluation_factor_index = inital_data.index("评审因素")
-                evaluation_criteria_index = inital_data.index("评审标准")
-
-                for table in tables[1:]:
-                    tag = table[regulation_number_index+1]
-                    if tag: tag = tag.strip().replace("\n","")
-                    if tag:
-                        tag_sign = tag
-                    evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
-                    if tag_sign in tag_dict: 
-                        tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
-                                                "评审标准":evaluation_criteria.strip().replace("\n","")})
-                    if '评分因素' in table or '评分标准' in table:
-                        scrutinize_page = table_page_num
-                        scrutinize_Initial_title_len = title_len
-                if not scrutinize_page: scrutinize_page = table_page_num+1
-
-            ''' scrutinize '''
-            if (scrutinize_page == page_number[0] and scrutinize_Initial_title_len) or scrutinize_page == page_number[0]:
-                regulation_number_index_,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
-                scrutinize_sign = True
-                if not scrutinize_Initial_title_len: scrutinize_Initial_title_len = title_len
-                for table in tables:
-                    if '评分因素' in table and '评分标准' in table:
-                        regulation_number_index_ = table.index("条款号")
-                        evaluation_factor_index = table.index("评分因素")
-                        evaluation_criteria_index = table.index("评分标准")
-                        weights_index = table.index("权重")
-                        tag_sign_ = ''
-                        scrutinize_index = tables.index(table)
-                        break
-                    elif '评分因素' in table and '评分标准' not in table:
-                        scrutinize_index = tables.index(table)
-                        table_split = table[-1].replace(' ','').split()
-                        if '评分标准' in table_split and '权重' in table_split:
-                            table = table[:-1]
-                            table.extend(table_split)
-                        regulation_number_index_ = table.index("条款号")
-                        evaluation_factor_index = table.index("评分因素")
-                        evaluation_criteria_index = table.index("评分标准")
-                        weights_index = table.index("权重")
-                        tag_sign_ = ''
-                        break
-                if scrutinize_index != -1:
-                    for table in tables[scrutinize_index+1:]:
+            if 30 < page_number[0] < 50:
+                form_sign = re.findall('评\w+法前附表',table_name)
+                if form_sign:
+                    table_page_num = page_number[-1]
+                    inital_data = tables[0]
+                    # confirm data location
+                    regulation_number_index = inital_data.index("条款号")
+                    evaluation_factor_index = inital_data.index("评审因素")
+                    evaluation_criteria_index = inital_data.index("评审标准")
+
+                    for table in tables[1:]:
+                        tag = table[regulation_number_index+1]
+                        if tag: tag = tag.strip().replace("\n","")
+                        if tag:
+                            tag_sign = tag
+                        evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
+                        if tag_sign in tag_dict: 
+                            tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
+                                                    "评审标准":evaluation_criteria.strip().replace("\n","")})
+                        if '评分因素' in table or '评分标准' in table:
+                            scrutinize_page = table_page_num
+                            scrutinize_Initial_title_len = title_len
+                    if not scrutinize_page: scrutinize_page = table_page_num+1
+
+                ''' scrutinize '''
+                if (scrutinize_page == page_number[0] and scrutinize_Initial_title_len) or scrutinize_page == page_number[0]:
+                    regulation_number_index_,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
+                    scrutinize_sign = True
+                    if not scrutinize_Initial_title_len: scrutinize_Initial_title_len = title_len
+                    for table in tables:
+                        if '评分因素' in table and '评分标准' in table:
+                            regulation_number_index_ = table.index("条款号")
+                            evaluation_factor_index = table.index("评分因素")
+                            evaluation_criteria_index = table.index("评分标准")
+                            weights_index = table.index("权重")
+                            tag_sign_ = ''
+                            scrutinize_index = tables.index(table)
+                            break
+                        elif '评分因素' in table and '评分标准' not in table:
+                            scrutinize_index = tables.index(table)
+                            table_split = table[-1].replace(' ','').split()
+                            if '评分标准' in table_split and '权重' in table_split:
+                                table = table[:-1]
+                                table.extend(table_split)
+                            regulation_number_index_ = table.index("条款号")
+                            evaluation_factor_index = table.index("评分因素")
+                            evaluation_criteria_index = table.index("评分标准")
+                            weights_index = table.index("权重")
+                            tag_sign_ = ''
+                            break
+                    if scrutinize_index != -1:
+                        for table in tables[scrutinize_index+1:]:
+                            if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
+                            elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
+                            else: tag = table[regulation_number_index_]
+                            if tag: 
+                                tag = tag.strip().replace("\n","")
+                                tag = ''.join(re.findall(r"[\u4e00-\u9fa5]+", tag))
+                            if tag and self._scrutinize_judge(tag):
+                                tag_sign_ = tag
+                                if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
+                            try:
+                                evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
+                            except Exception as e:
+                                print(e)
+                            if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
+                            else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
+                                            "评分标准":evaluation_criteria.strip().replace("\n",""),
+                                            "权重":weights.strip().replace("\n","")}
+                            scrutinize_dict[tag_sign_].append(value)
+                            if table[regulation_number_index_]:
+                                if table[regulation_number_index_][0] == '3':
+                                    scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
+                                    scrutinize_Initial_title_len = 0
+                                    break
+                elif scrutinize_page+1 == page_number[0] and scrutinize_sign:
+                    difference_value = scrutinize_Initial_title_len - title_len
+                    if difference_value:
+                        table_length = len(table)
+                        evaluation_factor_index -= difference_value
+                        evaluation_criteria_index -= difference_value
+                        weights_index -= difference_value
+                        if weights_index >= table_length:
+                            evaluation_factor_index = table_length-3
+                            evaluation_criteria_index = table_length-2
+                            weights_index = table_length-1
+                    for table in tables:
+                        if not table[2]:
+                            scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
+                            continue
                         if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
                         elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
                         else: tag = table[regulation_number_index_]
                         if tag: 
                             tag = tag.strip().replace("\n","")
-                            tag = ''.join(re.findall(r"[\u4e00-\u9fa5]+", tag))
+                            tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
                         if tag and self._scrutinize_judge(tag):
                             tag_sign_ = tag
                             if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
-                        try:
-                            evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
-                        except:
-                            print()
-                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
+                        evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
+                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
                         else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
                                         "评分标准":evaluation_criteria.strip().replace("\n",""),
                                         "权重":weights.strip().replace("\n","")}
                         scrutinize_dict[tag_sign_].append(value)
                         if table[regulation_number_index_]:
-                            if table[regulation_number_index_][0] == '3':
-                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
-                                scrutinize_Initial_title_len = 0
-                                break
-            elif scrutinize_page+1 == page_number[0] and scrutinize_sign:
-                difference_value = scrutinize_Initial_title_len - title_len
-                if difference_value:
-                    table_length = len(table)
-                    evaluation_factor_index -= difference_value
-                    evaluation_criteria_index -= difference_value
-                    weights_index -= difference_value
-                    if weights_index >= table_length:
-                        evaluation_factor_index = table_length-3
-                        evaluation_criteria_index = table_length-2
-                        weights_index = table_length-1
-                for table in tables:
-                    if not table[2]:
-                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
-                        continue
-                    if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
-                    elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
-                    else: tag = table[regulation_number_index_]
-                    if tag: 
-                        tag = tag.strip().replace("\n","")
-                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
-                    if tag and self._scrutinize_judge(tag):
-                        tag_sign_ = tag
-                        if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
-                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
-                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
-                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
+                                if table[regulation_number_index_][0] == '3':
+                                    scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
+                                    scrutinize_Initial_title_len = 0
+                                    break
+                elif scrutinize_page+2 == page_number[0] and scrutinize_sign:
+                    difference_value = scrutinize_Initial_title_len - title_len
+                    if difference_value:
+                        evaluation_factor_index -= difference_value
+                        evaluation_criteria_index -= difference_value
+                        weights_index -= difference_value
+                    for table in tables:
+                        if not table[2]:
+                            scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
+                            continue
+                        if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
+                        elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
+                        else: tag = table[regulation_number_index_]
+                        if tag: 
+                            tag = tag.strip().replace("\n","")
+                            tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
+                        if tag and self._scrutinize_judge(tag):
+                            tag_sign_ = tag
+                            if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
+                        evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
+                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
+                        else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
                                     "评分标准":evaluation_criteria.strip().replace("\n",""),
                                     "权重":weights.strip().replace("\n","")}
-                    scrutinize_dict[tag_sign_].append(value)
-                    if table[regulation_number_index_]:
-                            if table[regulation_number_index_][0] == '3':
-                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
-                                scrutinize_Initial_title_len = 0
-                                break
-            elif scrutinize_page+2 == page_number[0] and scrutinize_sign:
-                difference_value = scrutinize_Initial_title_len - title_len
-                if scrutinize_Initial_title_len:
-                    evaluation_factor_index -= difference_value
-                    evaluation_criteria_index -= difference_value
-                    weights_index -= difference_value
-                for table in tables:
-                    if not table[2]:
-                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
-                        continue
-                    if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
-                    elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
-                    else: tag = table[regulation_number_index_]
-                    if tag: 
-                        tag = tag.strip().replace("\n","")
-                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
-                    if tag and self._scrutinize_judge(tag):
-                        tag_sign_ = tag
-                        if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
-                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
-                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
-                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
-                                "评分标准":evaluation_criteria.strip().replace("\n",""),
-                                "权重":weights.strip().replace("\n","")}
-                    scrutinize_dict[tag_sign_].append(value)
-                    if table[regulation_number_index_]:
-                            if table[regulation_number_index_][0] == '3':
-                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
-                                scrutinize_Initial_title_len = 0
-                                break
+                        scrutinize_dict[tag_sign_].append(value)
+                        if table[regulation_number_index_]:
+                                if table[regulation_number_index_][0] == '3':
+                                    scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
+                                    scrutinize_Initial_title_len = 0
+                                    break
 
-        pprint(scrutinize_dict)
         return scrutinize_dict
 
 
-
-from fastapi import FastAPI
-import uvicorn
-app = FastAPI()
-
-@app.post('get_pre_review')
-def get_pre_review():
-    
-    result = {
-        "":""
-    }
-    return result
-
-
-
 if __name__ == '__main__':
     path_list = []
     for path_ in path_list:

+ 26 - 9
extract_financial_report.py

@@ -26,9 +26,22 @@ def is_price(word: str) -> bool:
         return False
 
 
-def extract_financial_report(path: str, year: int = None):
-    instances = get_instances_by_title(path,
-                                       ['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)])
+def extract_financial_report(path: str, year: int = None) -> list:
+    """
+    财报解析
+
+    Args:
+        path: PDF 文件路径
+        year: 报告年份(检索前一年与前两年的审计报告)
+
+    Returns:
+        results: 抽取到的财报结果列表
+    """
+    instances = get_instances_by_title(
+        path,
+        ['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)]
+    )
+
     results = []
     ocr_agent = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
     for item in instances:
@@ -138,12 +151,16 @@ if __name__ == '__main__':
     os.environ["TRANSFORMERS_OFFLINE"] = '1'
 
     y = datetime.datetime.now().year
-    print(extract_financial_report(
-        '/home/zzh/ocr/pdf/美华建设有限公司/投标文件111.pdf',
-        # '/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/南方电网数字研究院有限公司.pdf',
-        # '/home/zzh/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-        2022
-    ))
+
+    print(
+        extract_financial_report(
+            '/home/zzh/ocr/pdf/美华建设有限公司/投标文件111.pdf',
+            # '/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/南方电网数字研究院有限公司.pdf',
+            # '/home/zzh/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
+            2022
+        )
+    )
+
     # start = time.time()
     # fs = scan_dir('/home/zzh/ocr/pdf/', 'pdf')
     #

+ 38 - 12
extract_price.py

@@ -1,3 +1,7 @@
+import os
+os.environ['TRANSFORMERS_OFFLINE'] = '1'
+os.environ['HF_DATASETS_OFFLINE'] = '1'
+
 from re import findall
 from typing import List
 
@@ -55,16 +59,38 @@ def match_quality(text: str) -> List[str]:
 
 
 if __name__ == '__main__':
-    price_zhs = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
-                             '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-                             match_price_zhs)
-    price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
-                             '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-                             match_price_num)
-    duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
-                            '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-                            match_duration)
-    quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
-                           '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-                           match_quality)
+    from pprint import pprint
+
+    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
+    price_zhs = get_instance(
+        title_instances=['投标函', '开标一览表'],
+        content_instances=['人民币投标总报价'],
+        pdf_path,
+        match_price_zhs
+    )
+    price_num = get_instance(
+        title_instances=['投标函', '开标一览表'],
+        content_instances=['人民币投标总报价'],
+        pdf_path,
+        match_price_num
+    )
+    duration = get_instance(
+        title_instances=['投标函', '开标一览表'],
+        content_instances=['工期日历天'],
+        pdf_path,
+        match_duration
+    )
+    quality = get_instance(
+        title_instances=['投标函', '开标一览表'],
+        content_instances=['工程质量'],
+        pdf_path,
+        match_quality
+    )
     valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
+    pprint({
+        "price_zhs": price_zhs,
+        "price_num": price_num,
+        "duration": duration,
+        "quality": quality,
+        "valid": valid
+    })

+ 73 - 5
get_info.py

@@ -2,7 +2,7 @@
 # @Author: privacy
 # @Date:   2024-06-11 13:43:14
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-08-08 17:07:49
+# @Last Modified time: 2024-08-27 14:50:15
 
 # import os
 
@@ -80,6 +80,7 @@ import re
 import json
 from io import BytesIO
 from pprint import pprint
+from typing import Optional
 
 # 第三方包导入
 import cv2
@@ -102,17 +103,41 @@ import camelot
 # 自定义包导入
 from tools import RefPageNumberResolver
 
+
+PIL_ERROR_MESSAGE = "No module named 'PIL', please run 'pip install pillow'"
+
+
 HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'})
 
 
+pattern_1 = re.compile(r'^\d(\d*\.?\d*)+\d(%)?')
+pattern_2 = re.compile('^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[\(\(][一二三四五六七八九十]+[\)\)]')
+pattern_3 = re.compile('^附录|^参考文献|^附表')
+
+
 def is_title(line: str) -> bool:
+    """
+    判断某行文本是否为标题
+
+    Args:
+        line: 文本行
+
+    Returns:
+        是否是标题
+    """
+    # if re.fullmatch(r'^\d(\d*\.?\d*)+\d(%)?', line.strip()):
+    if pattern_1.fullmatch(line.strip()):
+        return False
+
     # title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
-    title_word = re.findall('^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[\(\(][一二三四五六七八九十]+[\)\)]', line.strip())
+    title_word = pattern_2.findall(line.strip())
     if title_word:
         return True
-    title_word = re.findall('^附录|^参考文献|^附表', line.strip())
+
+    title_word = pattern_3.findall(line.strip())
     if title_word:
         return True
+
     return False
 
 def export_image(image: LTImage, path: str) -> str:
@@ -332,6 +357,38 @@ class PdfExtractAttr(object):
         self.detail_df = None
         self.outlines = None
 
+    def parse_title(self) -> list:
+        """
+        标题解析
+        """
+        texts = []
+
+        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
+            title_index = 0
+            for element in page_layout:
+                if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
+                    text = element.get_text().strip()
+                    if text and (is_title(text) or element.height > 15):
+                        texts.append({
+                            'index': title_index,
+                            'page_number': page_number,
+                            'bbox': element.bbox,
+                            'text': text
+                        })
+                        title_index += 1
+
+        results = []
+
+        for i, text in enumerate(texts):
+            results.append({
+                'title': text['text'],
+                'index': text['index'],
+                'page_number': text['page_number'],
+                'seq_num': i
+            })
+
+        return results
+
     def can_merge_lines(self, line1, line2) -> bool:
         """判断两行文本是否可以合并为一段
         """
@@ -468,7 +525,6 @@ class PdfExtractAttr(object):
                     # 计算页码
                     page_n = int(re.findall('\d+$', line).pop())
                     # 添加到目录结构中
-                    # directory_structure.append({
                     results.append({
                         "level": indent_level,
                         "title": title,
@@ -481,6 +537,8 @@ class PdfExtractAttr(object):
         return results
 
     def extract_content(self, content_path: str = None) -> list:
+        self.content = []
+
         with pdfplumber.open(self.file_path) as pdf:
             for page in pdf.pages:
                 self.content.append({
@@ -535,7 +593,7 @@ class PdfExtractAttr(object):
 
         return results
 
-    def parse_text(self) -> None:
+    def parse_text(self, text_path: Optional[str] = None) -> list:
         """文本解析
         """
         for page_number, page_layout in enumerate(extract_pages(self.file_path)):
@@ -569,8 +627,15 @@ class PdfExtractAttr(object):
                         'text': element.get_text().strip(),
                         'is_table_name': element.get_text().strip().endswith('表')
                     })
+
+        if text_path:
+            with open(text_path, 'w', encoding='utf-8') as fp:
+                json.dump(self.details, fp, indent=4, ensure_ascii=False)
+
         self.detail_df = pd.DataFrame(self.details)
 
+        return self.details
+
     def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
         """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
         @table
@@ -607,6 +672,8 @@ class PdfExtractAttr(object):
     def parse_table_pro(self, table_path: str = 'all_tables.json') -> None:
         """表格解析
         """
+        self.tables = []
+
         if self.detail_df == None:
             self.parse_text()
 
@@ -660,6 +727,7 @@ class PdfExtractAttr(object):
         return self.tables
 
 
+
 if __name__ == '__main__':
     pdf_path = './投标文件-修改版9-5-1-1.pdf'
     # pdf_path = './南方电网数字研究院有限公司.pdf'

+ 36 - 22
instance_locate.py

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
 from pprint import pprint
@@ -46,51 +46,65 @@ def parse_pages(pdf_path: str, text_path: str, image_dir: str, start_page: int,
         json.dump(texts, fp, indent=4, ensure_ascii=False)
 
 
-def get_instances_by_title(path: str, instances: List[str]):
+def get_instances_by_title(path: Optional[str], title_list: List[dict], table_list: List[dict], instances: List[str] = ['近年财务状况表']):
     """
     Get all tables and figures of given title
+
+    Args:
+        path:
+        title_list: PDF 标题
+        table_list: PDF 表格
+        instances:
+
+    Returns:
+        results
     """
 
-    # path = './投标文件-修改版9-5-1-1.pdf'
-    # instances = ['近年财务状况表']
-    file = PdfExtractAttr(file_path=path)
-    print('解析PDF文字中')
-    file.parse_text()
-    # title = file.parse_outline()
-    print('解析PDF标题中')
-    all_title = parse_title(path)
-    # all_text = file.parse_text()  # remain for external parse
-
-    print('分析标题中')
-    title_sims = similarity_filter(similar_match(all_title, instances, key='title'), 0.5)
+    title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)
+
     title_f = [i for i in title_sims]
+
     results = []
+
     for i in title_f:
         try:
-            i['end_page'] = all_title[i['seq_num'] + 1]['page_number'] - 1
+            i['end_page'] = title_list[i['seq_num'] + 1]['page_number'] - 1
             if i['end_page'] <= i['page_number']:
                 continue
-            # i['end_page'] = all_title[i['seq_num']]['page_number'] + 5  # for debug
         except IndexError:
             i['end_page'] = float('inf')
 
-        image_loc = os.path.join(os.path.dirname(path), 'images')
+        image_loc = os.path.join(os.path.dirname(path), 'extracted_images')
+
         if not os.path.exists(image_loc):
             os.makedirs(image_loc)
+
         print('解析标题:\t{}'.format(i['title']))
+
         print('解析图片中')
-        parse_pages(path, os.path.join(os.path.dirname(path),
-                                       '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index'])),
-                    image_loc, i['page_number'], i['end_page'], file.total_page)
 
-        table_loc = os.path.join(os.path.dirname(path),
-                                 '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
+        parse_pages(
+            path,
+            os.path.join(os.path.dirname(path), '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index'])),
+            image_loc,
+            i['page_number'],
+            i['end_page'],
+            file.total_page
+        )
+
+        table_loc = os.path.join(os.path.dirname(path), '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
+
         print('解析表格中')
+
         tables = file.parse_table(start=i['page_number'], end=i['end_page'])
+
         i['tables'] = tables
+
         with open(table_loc, 'w', encoding='utf-8') as fp:
             json.dump(tables, fp, indent=4, ensure_ascii=False)
+
         i.update({'table_loc': table_loc, 'image_loc': image_loc})
+
         results.append(i)
 
     return results

+ 9 - 5
matcher.py

@@ -2,7 +2,11 @@
 # @Author: privacy
 # @Date:   2024-06-27 09:33:01
 # @Last Modified by:   privacy
-# @Last Modified time: 2024-06-27 14:44:43
+# @Last Modified time: 2024-08-23 12:10:09
+import os
+os.environ['TRANSFORMERS_OFFLINE'] = '1'
+os.environ['HF_DATASETS_OFFLINE'] = '1'
+
 import torch
 import numpy as np
 import pandas as pd
@@ -29,18 +33,18 @@ class Matcher:
         return pd.Series([most_similar_keyword, max(similarities)])
 
     def get_embedding(self, text: str):
-        encoded_input = tokenizer(text, return_tensors='pt')
+        encoded_input = self.tokenizer(text, return_tensors='pt')
         with torch.no_grad():
-            output = model(**encoded_input)
+            output = self.model(**encoded_input)
         text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
         return text_embedding
     
     def get_embeddings(self, text_list: list) -> list:
         text_embeddings = []
         for text in text_list:
-            encoded_input = tokenizer(text, return_tensors='pt')
+            encoded_input = self.tokenizer(text, return_tensors='pt')
             with torch.no_grad():
-                output = model(**encoded_input)
+                output = self.model(**encoded_input)
             text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
         return text_embeddings
 

+ 3 - 2
ocr_api.py

@@ -9,7 +9,7 @@ class OcrAgent:
     def __init__(self, url):
         self.url = url
 
-    def get_content(self, image_path):
+    def get_content(self, image_path: str) -> dict:
         try:
             with open(image_path, 'rb') as image_file:
                 files = {"file": ("image.jpg", image_file, "image/jpeg")}
@@ -38,5 +38,6 @@ def find_current_row(ocr_result: List[dict], top: int, bottom: int, float_range:
 if __name__ == '__main__':
     agent = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
     res = agent.get_content(
-        os.path.join('/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/images', 'image_page_1131_0.png'))
+        os.path.join('D:\\desktop\\三峡水利\\data\\projects\\2022-2025年度三峡电站9台机组检修密封加工制作重新招标\\投标\\东方电气\\extracted_images\\', 'image_page_27_1.jpg'))
+    print(res)
     pass

+ 47 - 19
project_loc.py

@@ -1,7 +1,11 @@
-from typing import List
-from get_info import PdfExtractAttr
-# from scan_dir import scan_dir
 import time
+import json
+from typing import (
+    List,
+    Optional
+)
+
+from get_info import PdfExtractAttr
 
 
 ins = ['合同金额', '合同价格', '发包人名称']
@@ -14,10 +18,32 @@ def batch_bool(instances: List[str], text: str) -> bool:
     return False
 
 
-def extract_project(path: str, instances: List[str]):
-    agent = PdfExtractAttr(file_path=path)
-    tables = agent.parse_table_pro()
+def extract_project(instances: List[str], table_dict: Optional[dict] = None, table_path: Optional[str] = None, pdf_path: Optional[str] = None) -> list:
+    """
+    从表格中抽取项目业绩
+
+    Args:
+        instance:   抽取的字段
+        table_dict: json表格
+        table_path: 表格文件路径
+        pdf_path:   pdf源文件路径
+
+    Returns:
+        res 项目业绩表
+    """
+    if table_dict:
+        tables = table_dict
+    elif table_path:
+        with open(table_path, 'r', encoding='utf-8') as jsonfile:
+            tables = json.load(jsonfile)
+    elif pdf_path:
+        agent = PdfExtractAttr(file_path=pdf_path)
+        tables = agent.parse_table_pro()
+    else:
+        raise ValueError("请输入需要解析的文件!")
+
     res = []
+
     for table in tables:
 
         tab = [[j.replace('\n', '').replace(' ', '') for j in i] for i in table['table']]
@@ -26,7 +52,6 @@ def extract_project(path: str, instances: List[str]):
         rows = [row[0] for row in tab]
 
         for i in rows:
-            # if '合同金额' in i or '合同价格' in i or '发包人名称' in i:
             if batch_bool(instances, i):
                 res.append({
                     "page_numbers": pages,
@@ -38,16 +63,19 @@ def extract_project(path: str, instances: List[str]):
 
 
 if __name__ == '__main__':
-    # fs = scan_dir('/home/zzh/ocr/pdf', 'pdf')
-    # start = time.time()
-    # for f in (fs[:]):
-    #     try:
-    #         print(f)
-    #         print(extract_project(f, ins))
-    #         print('\n*********Runtime {} s *********\n'.format(time.time() - start))
-    #     except BaseException as e:
-    #         print('Something wrong')
-    #         print(e)
-
-    print(extract_project(r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf', ['合同金额', '合同价格', '发包人名称']))
+    from pprint import pprint
+
+    file = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
+
+    table_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司-table.json'
+
+    with open(table_path, 'r', encoding='utf-8') as jsonfile:
+        tables = json.load(jsonfile)
+
+    pprint(
+        extract_project(
+            instances=['合同金额', '合同价格', '发包人名称'],
+            table_dict=tables
+        )
+    )
 

+ 3 - 1
requirements.txt

@@ -9,4 +9,6 @@ transformers==4.41.2
 textrank4zh==0.3
 jieba==0.42.1
 camelot==0.20.1
-PyMuPDF==1.24.9
+PyMuPDF==1.24.9
+celery==5.4.0
+redis==5.0.8

+ 70 - 36
text_extractor.py

@@ -6,39 +6,43 @@ from pdfminer.pdfparser import PDFParser
 from matcher import Matcher
 from get_info import PdfExtractAttr, is_title
 from typing import Callable, Union, List, Tuple, Dict
-from re import fullmatch
 from tqdm import tqdm
 import pandas as pd
 
 
-def absolute_not_title(line: str) -> bool:
-    if fullmatch(r'^\d(\d*\.?\d*)+\d(%)?', line):
-        return True
-    else:
-        return False
 
+# def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
+#     """
+#     标题解析
 
-def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
-    texts = []
-    for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
-                                         total=resolve1(PDFDocument(
-                                             PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
-                                         ):
-        title_index = 0
-        for element in page_layout:
-            if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
-                text = element.get_text().strip()
-                if text and (is_title(text) or element.height > 15) and (not absolute_not_title(text)):
-                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
-                    title_index += 1
-    results = []
-    for i, text in enumerate(texts):
-        results.append({'title': text['text'],
-                        'index': text['index'],
-                        'page_number': text['page_number'],
-                        'seq_num': i
-                        })
-    return results
+#     Args:
+#         pdf_path: PDF文件路径
+
+#     Returns:
+#         results
+#     """
+#     texts = []
+
+#     for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
+#                                          total=resolve1(PDFDocument(
+#                                              PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
+#                                          ):
+#         title_index = 0
+#         for element in page_layout:
+#             if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
+#                 text = element.get_text().strip()
+#                 if text and (is_title(text) or element.height > 15):
+#                     texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
+#                     title_index += 1
+#     results = []
+
+#     for i, text in enumerate(texts):
+#         results.append({'title': text['text'],
+#                         'index': text['index'],
+#                         'page_number': text['page_number'],
+#                         'seq_num': i
+#                         })
+#     return results
 
 
 def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
@@ -100,27 +104,56 @@ def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
     return max_sim_rows.to_dict(orient='records')
 
 
-def get_instance(title_instances: List[str], content_instances: List[str], pdf: str,
+def get_instance(title_instances: List[str],
+                 content_instances: List[str],
+                 pdf_path: str,
                  extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
-                 page_bias: int = 1, similarity: float = None):
-    file = PdfExtractAttr(file_path=pdf)
-    # titles = file.parse_outline()
-    titles = parse_title(pdf)
+                 page_bias: int = 1,
+                 similarity: float = None
+                ):
+    """
+    Args:
+        title_instances
+        content_instances
+        pdf_path
+        extractor
+        page_bias
+        similarity
+
+    Returns:
+        results
+    """
+    file = PdfExtractAttr(file_path=pdf_path)
+    titles = file.parse_title()
     texts = file.parse_text()
 
-    title_sims = similarity_filter(similar_match(titles, title_instances, key='title'), similarity)
+    title_sims = similarity_filter(
+        similar_match(
+            titles,
+            title_instances,
+            key='title'
+        ),
+        similarity
+    )
+
     results = []
+
     for i in title_sims:
         current_page = i['page_number']
         _, text = pagination_texts(texts, current_page, current_page + page_bias)
         results.extend(extract_from_texts(text, extractor, content_instances))
+
     return results
 
 
 if __name__ == '__main__':
-    # price_zhs = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
-    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
-    #                          match_price_zhs)
+    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
+    price_zhs = get_instance(
+        title_instances=['投标函', '开标一览表'],
+        content_instances=['人民币投标总报价'],
+        pdf_path=pdf_path,
+        extractor=match_price_zhs
+    )
     # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
     #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
     #                          match_price_num)
@@ -133,4 +166,5 @@ if __name__ == '__main__':
     # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
     # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
     # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
+    print(price_zhs)
     pass