1 năm trước cách đây · b66967d53c
--- a/README.md
+++ b/README.md
@@ -1,6 +1,155 @@
 
				+服务架构
			
 
				+---
			
 
				+
			
 
				+![image-20240826152353199](image-20240826152353199.png)
			
 
				+
			
 
				+流程结构
			
 
				+---
			
 
				+
			
 
				+```mermaid
			
 
				+graph LR
			
 
				+    工厂[工厂] --> 创建工厂
			
 
				+    创建工厂 --> Celery实例
			
 
				+    创建工厂 --> Flask实例
			
 
				+    Celery实例 --> Redis中间件
			
 
				+    Celery实例 --> Worker1
			
 
				+    Celery实例 --> Worker2
			
 
				+    Celery实例 --> Worker3
			
 
				+    Celery实例 --> Worker4
			
 
				+    Celery实例 --> Worker5
			
 
				+    Flask实例 --> submit
			
 
				+    submit --> Redis中间件
			
 
				+    Redis中间件 --> Worker1
			
 
				+    Redis中间件 --> Worker2
			
 
				+    Redis中间件 --> Worker3
			
 
				+    Redis中间件 --> Worker4
			
 
				+    Redis中间件 --> Worker5
			
 
				+    Worker1 --> ALGNODE1
			
 
				+    Worker2 --> ALGNODE2
			
 
				+    Worker3 --> ALGNODE3
			
 
				+    Worker4 --> ALGNODE4
			
 
				+    Worker5 --> ALGNODE5
			
 
				+```
			
 
				+
			
 
				+
			
 
				+```mermaid
			
 
				+graph LR
			
 
				+    ALGNODE[ALGNODE] --> 通用PDF抽取模块
			
 
				+    通用PDF抽取模块 --> 图片解析
			
 
				+    通用PDF抽取模块 --> 表格解析
			
 
				+    通用PDF抽取模块 --> 正文解析
			
 
				+    通用PDF抽取模块 --> 标题解析
			
 
				+    图片解析 --> 图片OCR文本
			
 
				+    表格解析 --> 表格内容
			
 
				+    正文解析 --> 正文内容
			
 
				+    标题解析 --> 标题内容
			
 
				+    图片OCR文本 --> 通用数据回收
			
 
				+    表格内容 --> 通用数据回收
			
 
				+    正文内容 --> 通用数据回收
			
 
				+    标题内容 --> 通用数据回收
			
 
				+    通用数据回收 --> 采购数据
			
 
				+    通用数据回收 --> 投标数据
			
 
				+    采购数据 --> 预审
			
 
				+    投标数据 --> 预审
			
 
				+    采购数据 --> 初审
			
 
				+    投标数据 --> 初审
			
 
				+    采购数据 --> 详审
			
 
				+    投标数据 --> 详审
			
 
				+    采购数据 --> 报价评审
			
 
				+    投标数据 --> 报价评审
			
 
				+    预审 --> 回传
			
 
				+    初审 --> 回传
			
 
				+    详审 --> 回传
			
 
				+    报价评审 --> 回传
			
 
				+```
			
 
				+
			
 
				+代码层
			
 
				+---
			
 
				+
			
 
				+### 异步配置文件(backend/config.py)
			
 
				+
			
 
				+```python
			
 
				+class Config:
			
 
				+    CELERY = dict(
			
 
				+        broker_url = 'redis://localhost:6379/0',
			
 
				+        result_backend = 'redis://localhost:6379/0',
			
 
				+        include = "backend.celery_task",
			
 
				+        task_ignore_result = True,
			
 
				+        timezone = 'Asia/Shanghai',
			
 
				+        enable_utc = False,
			
 
				+        task_track_started = True
			
 
				+    )
			
 
				+```
			
 
				+
			
 
				+### 创建工厂(backend/__init__.py)
			
 
				+
			
 
				+```python
			
 
				+from flask import Flask
			
 
				+from celery import Celery, Task
			
 
				+
			
 
				+celery_app = Celery(__name__)
			
 
				+
			
 
				+def create_app(test_config: dict = None) -> Flask:
			
 
				+    app = Flask(__name__)
			
 
				+
			
 
				+    class FlaskTask(Task):
			
 
				+        def __call__(self, *args: object, **kwargs: object) -> object:
			
 
				+            with app.app_context():
			
 
				+                return self.run(*args, **kwargs)
			
 
				+
			
 
				+    if test_config is None:
			
 
				+        app.config.from_pyfile('config.py', silent=True)
			
 
				+    else:
			
 
				+        app.config.from_mapping(test_config)
			
 
				+
			
 
				+    celery_app.config_from_object(app.config['CELERY'])
			
 
				+    celery_app.Task = FlaskTask
			
 
				+    celery_app.set_default()
			
 
				+
			
 
				+    return app
			
 
				+```
			
 
				+
			
 
				+### 封包脚本(make_celery.py)
			
 
				+
			
 
				+```python
			
 
				+from backend import (
			
 
				+    create_app,
			
 
				+    celery_app,
			
 
				+)
			
 
				+
			
 
				+app = create_app()
			
 
				+```
			
 
				+
			
 
				+### WEB配置 (gunicorn_config.py)
			
 
				+
			
 
				+```python
			
 
				+workers = 1
			
 
				+bind = "0.0.0.0:8000"
			
 
				+backlog = 2048
			
 
				+loglevel = "INFO"
			
 
				+daemon = True
			
 
				+pidfile = "/var/run/backend.pid"
			
 
				+accesslog = "/var/log/backend/access.log"
			
 
				+errorlog = "/var/log/backend/error.log"
			
 
				+```
			
 
				+
			
 
				+### 执行脚本(run.sh)
			
 
				+
			
 
				+```bash
			
 
				+gunicorn --config gunicorn_config.py make_celery:app
			
 
				+celery multi start worker -A make_celery:celery_app -P prefork -E --loglevel=INFO --logfile=/var/log/celery/%n%I.log --pidfile=/run/celery/%n.pid
			
 
				+celery -A make_celery:celery_app events
			
 
				+```
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				 
			
 
				 
			
 
				 主要模块描述
			
 
				+---
			
 
				+
			
 
				 1、tools     大纲解析模块
			
 
				 2、get_info  PDF信息抽取模块
			
 
				 3、matcher   段落定位模块
			
@@ -11,6 +160,13 @@
 
				 7、LLMAgent      大模型调用模块
			
 
				 8、document_     招标文件解析模块
			
 
				 
			
 
				+9、extract_price 报价抽取模块
			
 
				+
			
 
				+10、doc2pdf      word文档转pdf模块
			
 
				+
			
 
				+11、extract_financial_report 财报抽取模块
			
 
				+
			
 
				+12、ocr_api      图片OCR类
			
 
				 
			
 
				 ##### PDF中无边框表格内容抽取
			
 
				 ```
			
--- a/document_.py
+++ b/document_.py
@@ -25,50 +25,47 @@ chinese_num_map = {
 
				     '十': 10
			
 
				 } 
			
 
				 
			
 
				-def create_logger(log_path):
			
 
				-    """
			
 
				-    将日志输出到日志文件和控制台
			
 
				-    """
			
 
				-    logger = logging.getLogger()
			
 
				-    logger.setLevel(logging.INFO)
			
 
				-
			
 
				-    formatter = logging.Formatter(
			
 
				-        '%(asctime)s - %(levelname)s - %(message)s')
			
 
				-
			
 
				-    # 创建一个handler，用于写入日志文件
			
 
				-    file_handler = logging.FileHandler(
			
 
				-        filename=log_path, mode='w')
			
 
				-    file_handler.setFormatter(formatter)
			
 
				-    file_handler.setLevel(logging.INFO)
			
 
				-    logger.addHandler(file_handler)
			
 
				-
			
 
				-    # 创建一个handler，用于将日志输出到控制台
			
 
				-    console = logging.StreamHandler()
			
 
				-    console.setLevel(logging.DEBUG)
			
 
				-    console.setFormatter(formatter)
			
 
				-    logger.addHandler(console)
			
 
				-
			
 
				-    return logger
			
 
				-
			
 
				-log_path = "code/logs/logs.log" 
			
 
				-logger = create_logger(log_path=log_path)
			
 
				-
			
 
				-class DocumentPreReview():
			
 
				-    def __init__(self, file_path) -> None:
			
 
				+# def create_logger(log_path):
			
 
				+#     """
			
 
				+#     将日志输出到日志文件和控制台
			
 
				+#     """
			
 
				+#     logger = logging.getLogger()
			
 
				+#     logger.setLevel(logging.INFO)
			
 
				+
			
 
				+#     formatter = logging.Formatter(
			
 
				+#         '%(asctime)s - %(levelname)s - %(message)s')
			
 
				+
			
 
				+#     # 创建一个handler，用于写入日志文件
			
 
				+#     file_handler = logging.FileHandler(
			
 
				+#         filename=log_path, mode='w')
			
 
				+#     file_handler.setFormatter(formatter)
			
 
				+#     file_handler.setLevel(logging.INFO)
			
 
				+#     logger.addHandler(file_handler)
			
 
				+
			
 
				+#     # 创建一个handler，用于将日志输出到控制台
			
 
				+#     console = logging.StreamHandler()
			
 
				+#     console.setLevel(logging.DEBUG)
			
 
				+#     console.setFormatter(formatter)
			
 
				+#     logger.addHandler(console)
			
 
				+
			
 
				+#     return logger
			
 
				+
			
 
				+# log_path = "./logs.log" 
			
 
				+# logger = create_logger(log_path=log_path)
			
 
				+
			
 
				+class DocumentPreReview:
			
 
				+    def __init__(self) -> None:
			
 
				         self.bm = BaseMethods()
			
 
				-        self.Bidding_tables = self.get_Bidding_table(file_path)
			
 
				-    
			
 
				-    
			
 
				+
			
 
				     def get_Bidding_table(self, file_path:str):
			
 
				         ''' get table data
			
 
				         '''
			
 
				         # file_path = "data/预审查数据/三峡左岸及电源电站中央空调系统管网及末端改造（发布稿）-table.json"
			
 
				         # file_path = "data/预审查数据/2023年档案管理系统功能优化项目采购程序文件-table.json"
			
 
				         all_tables = self.bm.json_read(file_path)
			
 
				+        self.Bidding_tables = all_tables
			
 
				         return all_tables
			
 
				-    
			
 
				-    
			
 
				-    
			
 
				+
			
 
				     def _scrutinize_judge(self, tag:str, threshold_value:int=3):
			
 
				         ''' Clause number content judgment 
			
 
				             商务 技术 报价 评审 评分 标准
			
@@ -153,9 +150,6 @@ class DocumentPreReview():
 
				                 tables_list.append(partial_form)
			
 
				         return tables_list
			
 
				 
			
 
				-
			
 
				-
			
 
				-
			
 
				     def get_table(self):
			
 
				         ''' parse the Bidding_tables.json file to get the table data from it.
			
 
				         '''
			
@@ -180,176 +174,150 @@ class DocumentPreReview():
 
				             title_len = partial_form['title_len']
			
 
				             tables = partial_form["table"]
			
 
				             
			
 
				-            if '投标人须知前附表' == table_name:  
			
 
				-                record_page = page_number[0]
			
 
				-            if page_number[0] < record_page + 3: 
			
 
				-                for table in tables[1:]:
			
 
				-                    if '条' in table: continue    # 存在BUG            
			
 
				-                    try:
			
 
				-                        if table[0] and table[0] not in bidder_know: bidder_know[table[0]] = []
			
 
				-                        if table[0]: bidder_know[table[0]].append({"条款名称":table[1],"编列内容":table[2]})
			
 
				-                    except:
			
 
				-                        logger.error('该文件中的投标人须知前附表部分表格没有边框，只有中间部分表格存在边框，提取代码认为只有边框存在才被判定为表格内容')
			
 
				-            
			
 
				-            form_sign = re.findall('评\w+法前附表',table_name)
			
 
				-            if form_sign:
			
 
				-                table_page_num = page_number[-1]
			
 
				-                inital_data = tables[0]
			
 
				-                # confirm data location
			
 
				-                regulation_number_index = inital_data.index("条款号")
			
 
				-                evaluation_factor_index = inital_data.index("评审因素")
			
 
				-                evaluation_criteria_index = inital_data.index("评审标准")
			
 
				-
			
 
				-                for table in tables[1:]:
			
 
				-                    tag = table[regulation_number_index+1]
			
 
				-                    if tag: tag = tag.strip().replace("\n","")
			
 
				-                    if tag:
			
 
				-                        tag_sign = tag
			
 
				-                    evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
			
 
				-                    if tag_sign in tag_dict: 
			
 
				-                        tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
			
 
				-                                                "评审标准":evaluation_criteria.strip().replace("\n","")})
			
 
				-                    if '评分因素' in table or '评分标准' in table:
			
 
				-                        scrutinize_page = table_page_num
			
 
				-                        scrutinize_Initial_title_len = title_len
			
 
				-                if not scrutinize_page: scrutinize_page = table_page_num+1
			
 
				-
			
 
				-            ''' scrutinize '''
			
 
				-            if (scrutinize_page == page_number[0] and scrutinize_Initial_title_len) or scrutinize_page == page_number[0]:
			
 
				-                regulation_number_index_,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
			
 
				-                scrutinize_sign = True
			
 
				-                if not scrutinize_Initial_title_len: scrutinize_Initial_title_len = title_len
			
 
				-                for table in tables:
			
 
				-                    if '评分因素' in table and '评分标准' in table:
			
 
				-                        regulation_number_index_ = table.index("条款号")
			
 
				-                        evaluation_factor_index = table.index("评分因素")
			
 
				-                        evaluation_criteria_index = table.index("评分标准")
			
 
				-                        weights_index = table.index("权重")
			
 
				-                        tag_sign_ = ''
			
 
				-                        scrutinize_index = tables.index(table)
			
 
				-                        break
			
 
				-                    elif '评分因素' in table and '评分标准' not in table:
			
 
				-                        scrutinize_index = tables.index(table)
			
 
				-                        table_split = table[-1].replace(' ','').split()
			
 
				-                        if '评分标准' in table_split and '权重' in table_split:
			
 
				-                            table = table[:-1]
			
 
				-                            table.extend(table_split)
			
 
				-                        regulation_number_index_ = table.index("条款号")
			
 
				-                        evaluation_factor_index = table.index("评分因素")
			
 
				-                        evaluation_criteria_index = table.index("评分标准")
			
 
				-                        weights_index = table.index("权重")
			
 
				-                        tag_sign_ = ''
			
 
				-                        break
			
 
				-                if scrutinize_index != -1:
			
 
				-                    for table in tables[scrutinize_index+1:]:
			
 
				+            if 30 < page_number[0] < 50:
			
 
				+                form_sign = re.findall('评\w+法前附表',table_name)
			
 
				+                if form_sign:
			
 
				+                    table_page_num = page_number[-1]
			
 
				+                    inital_data = tables[0]
			
 
				+                    # confirm data location
			
 
				+                    regulation_number_index = inital_data.index("条款号")
			
 
				+                    evaluation_factor_index = inital_data.index("评审因素")
			
 
				+                    evaluation_criteria_index = inital_data.index("评审标准")
			
 
				+
			
 
				+                    for table in tables[1:]:
			
 
				+                        tag = table[regulation_number_index+1]
			
 
				+                        if tag: tag = tag.strip().replace("\n","")
			
 
				+                        if tag:
			
 
				+                            tag_sign = tag
			
 
				+                        evaluation_factor,evaluation_criteria = table[evaluation_factor_index],table[evaluation_criteria_index]
			
 
				+                        if tag_sign in tag_dict: 
			
 
				+                            tag_dict[tag_sign].append({"评审因素":evaluation_factor.strip().replace("\n",""),
			
 
				+                                                    "评审标准":evaluation_criteria.strip().replace("\n","")})
			
 
				+                        if '评分因素' in table or '评分标准' in table:
			
 
				+                            scrutinize_page = table_page_num
			
 
				+                            scrutinize_Initial_title_len = title_len
			
 
				+                    if not scrutinize_page: scrutinize_page = table_page_num+1
			
 
				+
			
 
				+                ''' scrutinize '''
			
 
				+                if (scrutinize_page == page_number[0] and scrutinize_Initial_title_len) or scrutinize_page == page_number[0]:
			
 
				+                    regulation_number_index_,evaluation_factor_index,evaluation_criteria_index,weights_index = 0,0,0,0
			
 
				+                    scrutinize_sign = True
			
 
				+                    if not scrutinize_Initial_title_len: scrutinize_Initial_title_len = title_len
			
 
				+                    for table in tables:
			
 
				+                        if '评分因素' in table and '评分标准' in table:
			
 
				+                            regulation_number_index_ = table.index("条款号")
			
 
				+                            evaluation_factor_index = table.index("评分因素")
			
 
				+                            evaluation_criteria_index = table.index("评分标准")
			
 
				+                            weights_index = table.index("权重")
			
 
				+                            tag_sign_ = ''
			
 
				+                            scrutinize_index = tables.index(table)
			
 
				+                            break
			
 
				+                        elif '评分因素' in table and '评分标准' not in table:
			
 
				+                            scrutinize_index = tables.index(table)
			
 
				+                            table_split = table[-1].replace(' ','').split()
			
 
				+                            if '评分标准' in table_split and '权重' in table_split:
			
 
				+                                table = table[:-1]
			
 
				+                                table.extend(table_split)
			
 
				+                            regulation_number_index_ = table.index("条款号")
			
 
				+                            evaluation_factor_index = table.index("评分因素")
			
 
				+                            evaluation_criteria_index = table.index("评分标准")
			
 
				+                            weights_index = table.index("权重")
			
 
				+                            tag_sign_ = ''
			
 
				+                            break
			
 
				+                    if scrutinize_index != -1:
			
 
				+                        for table in tables[scrutinize_index+1:]:
			
 
				+                            if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
			
 
				+                            elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
			
 
				+                            else: tag = table[regulation_number_index_]
			
 
				+                            if tag: 
			
 
				+                                tag = tag.strip().replace("\n","")
			
 
				+                                tag = ''.join(re.findall(r"[\u4e00-\u9fa5]+", tag))
			
 
				+                            if tag and self._scrutinize_judge(tag):
			
 
				+                                tag_sign_ = tag
			
 
				+                                if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
			
 
				+                            try:
			
 
				+                                evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
			
 
				+                            except Exception as e:
			
 
				+                                print(e)
			
 
				+                            if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
			
 
				+                            else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
			
 
				+                                            "评分标准":evaluation_criteria.strip().replace("\n",""),
			
 
				+                                            "权重":weights.strip().replace("\n","")}
			
 
				+                            scrutinize_dict[tag_sign_].append(value)
			
 
				+                            if table[regulation_number_index_]:
			
 
				+                                if table[regulation_number_index_][0] == '3':
			
 
				+                                    scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
			
 
				+                                    scrutinize_Initial_title_len = 0
			
 
				+                                    break
			
 
				+                elif scrutinize_page+1 == page_number[0] and scrutinize_sign:
			
 
				+                    difference_value = scrutinize_Initial_title_len - title_len
			
 
				+                    if difference_value:
			
 
				+                        table_length = len(table)
			
 
				+                        evaluation_factor_index -= difference_value
			
 
				+                        evaluation_criteria_index -= difference_value
			
 
				+                        weights_index -= difference_value
			
 
				+                        if weights_index >= table_length:
			
 
				+                            evaluation_factor_index = table_length-3
			
 
				+                            evaluation_criteria_index = table_length-2
			
 
				+                            weights_index = table_length-1
			
 
				+                    for table in tables:
			
 
				+                        if not table[2]:
			
 
				+                            scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
			
 
				+                            continue
			
 
				                         if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
			
 
				                         elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
			
 
				                         else: tag = table[regulation_number_index_]
			
 
				                         if tag: 
			
 
				                             tag = tag.strip().replace("\n","")
			
 
				-                            tag = ''.join(re.findall(r"[\u4e00-\u9fa5]+", tag))
			
 
				+                            tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
			
 
				                         if tag and self._scrutinize_judge(tag):
			
 
				                             tag_sign_ = tag
			
 
				                             if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
			
 
				-                        try:
			
 
				-                            evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
			
 
				-                        except:
			
 
				-                            print()
			
 
				-                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""),"评分标准":evaluation_criteria.strip().replace("\n","")}
			
 
				+                        evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
			
 
				+                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
			
 
				                         else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
			
 
				                                         "评分标准":evaluation_criteria.strip().replace("\n",""),
			
 
				                                         "权重":weights.strip().replace("\n","")}
			
 
				                         scrutinize_dict[tag_sign_].append(value)
			
 
				                         if table[regulation_number_index_]:
			
 
				-                            if table[regulation_number_index_][0] == '3':
			
 
				-                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
			
 
				-                                scrutinize_Initial_title_len = 0
			
 
				-                                break
			
 
				-            elif scrutinize_page+1 == page_number[0] and scrutinize_sign:
			
 
				-                difference_value = scrutinize_Initial_title_len - title_len
			
 
				-                if difference_value:
			
 
				-                    table_length = len(table)
			
 
				-                    evaluation_factor_index -= difference_value
			
 
				-                    evaluation_criteria_index -= difference_value
			
 
				-                    weights_index -= difference_value
			
 
				-                    if weights_index >= table_length:
			
 
				-                        evaluation_factor_index = table_length-3
			
 
				-                        evaluation_criteria_index = table_length-2
			
 
				-                        weights_index = table_length-1
			
 
				-                for table in tables:
			
 
				-                    if not table[2]:
			
 
				-                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
			
 
				-                        continue
			
 
				-                    if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
			
 
				-                    elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
			
 
				-                    else: tag = table[regulation_number_index_]
			
 
				-                    if tag: 
			
 
				-                        tag = tag.strip().replace("\n","")
			
 
				-                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
			
 
				-                    if tag and self._scrutinize_judge(tag):
			
 
				-                        tag_sign_ = tag
			
 
				-                        if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
			
 
				-                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
			
 
				-                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
			
 
				-                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
			
 
				+                                if table[regulation_number_index_][0] == '3':
			
 
				+                                    scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
			
 
				+                                    scrutinize_Initial_title_len = 0
			
 
				+                                    break
			
 
				+                elif scrutinize_page+2 == page_number[0] and scrutinize_sign:
			
 
				+                    difference_value = scrutinize_Initial_title_len - title_len
			
 
				+                    if difference_value:
			
 
				+                        evaluation_factor_index -= difference_value
			
 
				+                        evaluation_criteria_index -= difference_value
			
 
				+                        weights_index -= difference_value
			
 
				+                    for table in tables:
			
 
				+                        if not table[2]:
			
 
				+                            scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
			
 
				+                            continue
			
 
				+                        if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
			
 
				+                        elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
			
 
				+                        else: tag = table[regulation_number_index_]
			
 
				+                        if tag: 
			
 
				+                            tag = tag.strip().replace("\n","")
			
 
				+                            tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
			
 
				+                        if tag and self._scrutinize_judge(tag):
			
 
				+                            tag_sign_ = tag
			
 
				+                            if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
			
 
				+                        evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
			
 
				+                        if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
			
 
				+                        else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
			
 
				                                     "评分标准":evaluation_criteria.strip().replace("\n",""),
			
 
				                                     "权重":weights.strip().replace("\n","")}
			
 
				-                    scrutinize_dict[tag_sign_].append(value)
			
 
				-                    if table[regulation_number_index_]:
			
 
				-                            if table[regulation_number_index_][0] == '3':
			
 
				-                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
			
 
				-                                scrutinize_Initial_title_len = 0
			
 
				-                                break
			
 
				-            elif scrutinize_page+2 == page_number[0] and scrutinize_sign:
			
 
				-                difference_value = scrutinize_Initial_title_len - title_len
			
 
				-                if scrutinize_Initial_title_len:
			
 
				-                    evaluation_factor_index -= difference_value
			
 
				-                    evaluation_criteria_index -= difference_value
			
 
				-                    weights_index -= difference_value
			
 
				-                for table in tables:
			
 
				-                    if not table[2]:
			
 
				-                        scrutinize_dict[tag_sign_][-1]['评分标准'] += table[3]
			
 
				-                        continue
			
 
				-                    if table[regulation_number_index_+1]: tag = table[regulation_number_index_+1]
			
 
				-                    elif self._scrutinize_judge(table[regulation_number_index_+2]): tag = table[regulation_number_index_+2]
			
 
				-                    else: tag = table[regulation_number_index_]
			
 
				-                    if tag: 
			
 
				-                        tag = tag.strip().replace("\n","")
			
 
				-                        tag = re.findall("[\u4e00-\u9fff]+", tag)[0]
			
 
				-                    if tag and self._scrutinize_judge(tag):
			
 
				-                        tag_sign_ = tag
			
 
				-                        if tag_sign_ not in scrutinize_dict: scrutinize_dict[tag_sign_] = []
			
 
				-                    evaluation_factor,evaluation_criteria,weights = table[evaluation_factor_index],table[evaluation_criteria_index],table[weights_index]
			
 
				-                    if not weights: value = {"评分因素":evaluation_factor.strip().replace("\n",""), "评分标准":evaluation_criteria.strip().replace("\n","")}
			
 
				-                    else: value = {"评分因素":evaluation_factor.strip().replace("\n",""),
			
 
				-                                "评分标准":evaluation_criteria.strip().replace("\n",""),
			
 
				-                                "权重":weights.strip().replace("\n","")}
			
 
				-                    scrutinize_dict[tag_sign_].append(value)
			
 
				-                    if table[regulation_number_index_]:
			
 
				-                            if table[regulation_number_index_][0] == '3':
			
 
				-                                scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
			
 
				-                                scrutinize_Initial_title_len = 0
			
 
				-                                break
			
 
				+                        scrutinize_dict[tag_sign_].append(value)
			
 
				+                        if table[regulation_number_index_]:
			
 
				+                                if table[regulation_number_index_][0] == '3':
			
 
				+                                    scrutinize_dict = {key: value for key, value in scrutinize_dict.items() if value}
			
 
				+                                    scrutinize_Initial_title_len = 0
			
 
				+                                    break
			
 
				 
			
 
				-        pprint(scrutinize_dict)
			
 
				         return scrutinize_dict
			
 
				 
			
 
				 
			
 
				-
			
 
				-from fastapi import FastAPI
			
 
				-import uvicorn
			
 
				-app = FastAPI()
			
 
				-
			
 
				-@app.post('get_pre_review')
			
 
				-def get_pre_review():
			
 
				-    
			
 
				-    result = {
			
 
				-        "":""
			
 
				-    }
			
 
				-    return result
			
 
				-
			
 
				-
			
 
				-
			
 
				 if __name__ == '__main__':
			
 
				     path_list = []
			
 
				     for path_ in path_list:
			
--- a/extract_financial_report.py
+++ b/extract_financial_report.py
@@ -26,9 +26,22 @@ def is_price(word: str) -> bool:
 
				         return False
			
 
				 
			
 
				 
			
 
				-def extract_financial_report(path: str, year: int = None):
			
 
				-    instances = get_instances_by_title(path,
			
 
				-                                       ['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)])
			
 
				+def extract_financial_report(path: str, year: int = None) -> list:
			
 
				+    """
			
 
				+    财报解析
			
 
				+
			
 
				+    Args:
			
 
				+        path:
			
 
				+        year:
			
 
				+
			
 
				+    Returns:
			
 
				+        results 
			
 
				+    """
			
 
				+    instances = get_instances_by_title(
			
 
				+        path,
			
 
				+        ['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)]
			
 
				+    )
			
 
				+
			
 
				     results = []
			
 
				     ocr_agent = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
			
 
				     for item in instances:
			
@@ -138,12 +151,16 @@ if __name__ == '__main__':
 
				     os.environ["TRANSFORMERS_OFFLINE"] = '1'
			
 
				 
			
 
				     y = datetime.datetime.now().year
			
 
				-    print(extract_financial_report(
			
 
				-        '/home/zzh/ocr/pdf/美华建设有限公司/投标文件111.pdf',
			
 
				-        # '/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/南方电网数字研究院有限公司.pdf',
			
 
				-        # '/home/zzh/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				-        2022
			
 
				-    ))
			
 
				+
			
 
				+    print(
			
 
				+        extract_financial_report(
			
 
				+            '/home/zzh/ocr/pdf/美华建设有限公司/投标文件111.pdf',
			
 
				+            # '/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/南方电网数字研究院有限公司.pdf',
			
 
				+            # '/home/zzh/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				+            2022
			
 
				+        )
			
 
				+    )
			
 
				+
			
 
				     # start = time.time()
			
 
				     # fs = scan_dir('/home/zzh/ocr/pdf/', 'pdf')
			
 
				     #
			
--- a/extract_price.py
+++ b/extract_price.py
@@ -1,3 +1,7 @@
 
				+import os
			
 
				+os.environ['TRANSFORMERS_OFFLINE'] = '1'
			
 
				+os.environ['HF_DATASETS_OFFLINE'] = '1'
			
 
				+
			
 
				 from re import findall
			
 
				 from typing import List
			
 
				 
			
@@ -55,16 +59,38 @@ def match_quality(text: str) -> List[str]:
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    price_zhs = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
			
 
				-                             '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				-                             match_price_zhs)
			
 
				-    price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
			
 
				-                             '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				-                             match_price_num)
			
 
				-    duration = get_instance(['投标函', '开标一览表'], ['工期日历天'],
			
 
				-                            '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				-                            match_duration)
			
 
				-    quality = get_instance(['投标函', '开标一览表'], ['工程质量'],
			
 
				-                           '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				-                           match_quality)
			
 
				+    from pprint import pprint
			
 
				+
			
 
				+    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
			
 
				+    price_zhs = get_instance(
			
 
				+        title_instances=['投标函', '开标一览表'],
			
 
				+        content_instances=['人民币投标总报价'],
			
 
				+        pdf_path,
			
 
				+        match_price_zhs
			
 
				+    )
			
 
				+    price_num = get_instance(
			
 
				+        title_instances=['投标函', '开标一览表'],
			
 
				+        content_instances=['人民币投标总报价'],
			
 
				+        pdf_path,
			
 
				+        match_price_num
			
 
				+    )
			
 
				+    duration = get_instance(
			
 
				+        title_instances=['投标函', '开标一览表'],
			
 
				+        content_instances=['工期日历天'],
			
 
				+        pdf_path,
			
 
				+        match_duration
			
 
				+    )
			
 
				+    quality = get_instance(
			
 
				+        title_instances=['投标函', '开标一览表'],
			
 
				+        content_instances=['工程质量'],
			
 
				+        pdf_path,
			
 
				+        match_quality
			
 
				+    )
			
 
				     valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
			
 
				+    pprint({
			
 
				+        "price_zhs": price_zhs,
			
 
				+        "price_num": price_num,
			
 
				+        "duration": duration,
			
 
				+        "quality": quality,
			
 
				+        "valid": valid
			
 
				+    })
			
--- a/get_info.py
+++ b/get_info.py
@@ -2,7 +2,7 @@
 
				 # @Author: privacy
			
 
				 # @Date:   2024-06-11 13:43:14
			
 
				 # @Last Modified by:   privacy
			
 
				-# @Last Modified time: 2024-08-08 17:07:49
			
 
				+# @Last Modified time: 2024-08-27 14:50:15
			
 
				 
			
 
				 # import os
			
 
				 
			
@@ -80,6 +80,7 @@ import re
 
				 import json
			
 
				 from io import BytesIO
			
 
				 from pprint import pprint
			
 
				+from typing import Optional
			
 
				 
			
 
				 # 第三方包导入
			
 
				 import cv2
			
@@ -102,17 +103,41 @@ import camelot
 
				 # 自定义包导入
			
 
				 from tools import RefPageNumberResolver
			
 
				 
			
 
				+
			
 
				+PIL_ERROR_MESSAGE = "No module named 'PIL', please run 'pip install pillow'"
			
 
				+
			
 
				+
			
 
				 HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价（元）', '含税合价（元）', '条款号', '评分因素', '评分标准', '页码'})
			
 
				 
			
 
				 
			
 
				+pattern_1 = re.compile(r'^\d(\d*\.?\d*)+\d(%)?')
			
 
				+pattern_2 = re.compile('^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[\（\(][一二三四五六七八九十]+[\）\)]')
			
 
				+pattern_3 = re.compile('^附录|^参考文献|^附表')
			
 
				+
			
 
				+
			
 
				 def is_title(line: str) -> bool:
			
 
				+    """
			
 
				+    判断某行文本是否为标题
			
 
				+
			
 
				+    Args:
			
 
				+        line: 文本行
			
 
				+
			
 
				+    Returns:
			
 
				+        是否是标题
			
 
				+    """
			
 
				+    # if re.fullmatch(r'^\d(\d*\.?\d*)+\d(%)?', line.strip()):
			
 
				+    if pattern_1.fullmatch(line.strip()):
			
 
				+        return False
			
 
				+
			
 
				     # title_word = re.findall('^[（\(][一二三四五六七八九十]+[\)）]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
			
 
				-    title_word = re.findall('^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[\（\(][一二三四五六七八九十]+[\）\)]', line.strip())
			
 
				+    title_word = pattern_2.findall(line.strip())
			
 
				     if title_word:
			
 
				         return True
			
 
				-    title_word = re.findall('^附录|^参考文献|^附表', line.strip())
			
 
				+
			
 
				+    title_word = pattern_3.findall(line.strip())
			
 
				     if title_word:
			
 
				         return True
			
 
				+
			
 
				     return False
			
 
				 
			
 
				 def export_image(image: LTImage, path: str) -> str:
			
@@ -332,6 +357,38 @@ class PdfExtractAttr(object):
 
				         self.detail_df = None
			
 
				         self.outlines = None
			
 
				 
			
 
				+    def parse_title(self) -> list:
			
 
				+        """
			
 
				+        标题解析
			
 
				+        """
			
 
				+        texts = []
			
 
				+
			
 
				+        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
			
 
				+            title_index = 0
			
 
				+            for element in page_layout:
			
 
				+                if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
			
 
				+                    text = element.get_text().strip()
			
 
				+                    if text and (is_title(text) or element.height > 15):
			
 
				+                        texts.append({
			
 
				+                            'index': title_index,
			
 
				+                            'page_number': page_number,
			
 
				+                            'bbox': element.bbox,
			
 
				+                            'text': text
			
 
				+                        })
			
 
				+                        title_index += 1
			
 
				+
			
 
				+        results = []
			
 
				+
			
 
				+        for i, text in enumerate(texts):
			
 
				+            results.append({
			
 
				+                'title': text['text'],
			
 
				+                'index': text['index'],
			
 
				+                'page_number': text['page_number'],
			
 
				+                'seq_num': i
			
 
				+            })
			
 
				+
			
 
				+        return results
			
 
				+
			
 
				     def can_merge_lines(self, line1, line2) -> bool:
			
 
				         """判断两行文本是否可以合并为一段
			
 
				         """
			
@@ -468,7 +525,6 @@ class PdfExtractAttr(object):
 
				                     # 计算页码
			
 
				                     page_n = int(re.findall('\d+$', line).pop())
			
 
				                     # 添加到目录结构中
			
 
				-                    # directory_structure.append({
			
 
				                     results.append({
			
 
				                         "level": indent_level,
			
 
				                         "title": title,
			
@@ -481,6 +537,8 @@ class PdfExtractAttr(object):
 
				         return results
			
 
				 
			
 
				     def extract_content(self, content_path: str = None) -> list:
			
 
				+        self.content = []
			
 
				+
			
 
				         with pdfplumber.open(self.file_path) as pdf:
			
 
				             for page in pdf.pages:
			
 
				                 self.content.append({
			
@@ -535,7 +593,7 @@ class PdfExtractAttr(object):
 
				 
			
 
				         return results
			
 
				 
			
 
				-    def parse_text(self) -> None:
			
 
				+    def parse_text(self, text_path: Optional[str] = None) -> None:
			
 
				         """文本解析
			
 
				         """
			
 
				         for page_number, page_layout in enumerate(extract_pages(self.file_path)):
			
@@ -569,8 +627,15 @@ class PdfExtractAttr(object):
 
				                         'text': element.get_text().strip(),
			
 
				                         'is_table_name': element.get_text().strip().endswith('表')
			
 
				                     })
			
 
				+
			
 
				+        if text_path:
			
 
				+            with open(text_path, 'w', encoding='utf-8') as fp:
			
 
				+                json.dump(self.details, fp, indent=4, ensure_ascii=False)
			
 
				+
			
 
				         self.detail_df = pd.DataFrame(self.details)
			
 
				 
			
 
				+        return self.details
			
 
				+
			
 
				     def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
			
 
				         """尝试将表添加到结果列中，有两种情况，直接添加一个新表；拼接最后一个表
			
 
				         @table
			
@@ -607,6 +672,8 @@ class PdfExtractAttr(object):
 
				     def parse_table_pro(self, table_path: str = 'all_tables.json') -> None:
			
 
				         """表格解析
			
 
				         """
			
 
				+        self.tables = []
			
 
				+
			
 
				         if self.detail_df == None:
			
 
				             self.parse_text()
			
 
				 
			
@@ -660,6 +727,7 @@ class PdfExtractAttr(object):
 
				         return self.tables
			
 
				 
			
 
				 
			
 
				+
			
 
				 if __name__ == '__main__':
			
 
				     pdf_path = './投标文件-修改版9-5-1-1.pdf'
			
 
				     # pdf_path = './南方电网数字研究院有限公司.pdf'
			
--- a/instance_locate.py
+++ b/instance_locate.py
@@ -1,4 +1,4 @@
 
				-from typing import List
			
 
				+from typing import List, Optional
			
 
				 from pdfminer.high_level import extract_pages
			
 
				 from pdfminer.layout import LTFigure, LTImage, LTTextBoxHorizontal
			
 
				 from pprint import pprint
			
@@ -46,51 +46,65 @@ def parse_pages(pdf_path: str, text_path: str, image_dir: str, start_page: int,
 
				         json.dump(texts, fp, indent=4, ensure_ascii=False)
			
 
				 
			
 
				 
			
 
				-def get_instances_by_title(path: str, instances: List[str]):
			
 
				+def get_instances_by_title(path: Optional[str] = None, title_list: List[dict], table_list: List[dict], instances: List[str] = ['近年财务状况表']):
			
 
				     """
			
 
				     Get all tables and figures of given title
			
 
				+
			
 
				+    Args:
			
 
				+        path:
			
 
				+        title_list: PDF 标题
			
 
				+        table_list: PDF 表格
			
 
				+        instances:
			
 
				+
			
 
				+    Returns:
			
 
				+        results
			
 
				     """
			
 
				 
			
 
				-    # path = './投标文件-修改版9-5-1-1.pdf'
			
 
				-    # instances = ['近年财务状况表']
			
 
				-    file = PdfExtractAttr(file_path=path)
			
 
				-    print('解析PDF文字中')
			
 
				-    file.parse_text()
			
 
				-    # title = file.parse_outline()
			
 
				-    print('解析PDF标题中')
			
 
				-    all_title = parse_title(path)
			
 
				-    # all_text = file.parse_text()  # remain for external parse
			
 
				-
			
 
				-    print('分析标题中')
			
 
				-    title_sims = similarity_filter(similar_match(all_title, instances, key='title'), 0.5)
			
 
				+    title_sims = similarity_filter(similar_match(title_list, instances, key='title'), 0.5)
			
 
				+
			
 
				     title_f = [i for i in title_sims]
			
 
				+
			
 
				     results = []
			
 
				+
			
 
				     for i in title_f:
			
 
				         try:
			
 
				-            i['end_page'] = all_title[i['seq_num'] + 1]['page_number'] - 1
			
 
				+            i['end_page'] = title_list[i['seq_num'] + 1]['page_number'] - 1
			
 
				             if i['end_page'] <= i['page_number']:
			
 
				                 continue
			
 
				-            # i['end_page'] = all_title[i['seq_num']]['page_number'] + 5  # for debug
			
 
				         except IndexError:
			
 
				             i['end_page'] = float('inf')
			
 
				 
			
 
				-        image_loc = os.path.join(os.path.dirname(path), 'images')
			
 
				+        image_loc = os.path.join(os.path.dirname(path), 'extracted_images')
			
 
				+
			
 
				         if not os.path.exists(image_loc):
			
 
				             os.makedirs(image_loc)
			
 
				+
			
 
				         print('解析标题:\t{}'.format(i['title']))
			
 
				+
			
 
				         print('解析图片中')
			
 
				-        parse_pages(path, os.path.join(os.path.dirname(path),
			
 
				-                                       '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index'])),
			
 
				-                    image_loc, i['page_number'], i['end_page'], file.total_page)
			
 
				 
			
 
				-        table_loc = os.path.join(os.path.dirname(path),
			
 
				-                                 '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
			
 
				+        parse_pages(
			
 
				+            path,
			
 
				+            os.path.join(os.path.dirname(path), '{}_texts_{}_{}.json'.format(i['title'], i['page_number'], i['index'])),
			
 
				+            image_loc,
			
 
				+            i['page_number'],
			
 
				+            i['end_page'],
			
 
				+            file.total_page
			
 
				+        )
			
 
				+
			
 
				+        table_loc = os.path.join(os.path.dirname(path), '{}_tables_{}_{}.json'.format(i['title'], i['page_number'], i['index']))
			
 
				+
			
 
				         print('解析表格中')
			
 
				+
			
 
				         tables = file.parse_table(start=i['page_number'], end=i['end_page'])
			
 
				+
			
 
				         i['tables'] = tables
			
 
				+
			
 
				         with open(table_loc, 'w', encoding='utf-8') as fp:
			
 
				             json.dump(tables, fp, indent=4, ensure_ascii=False)
			
 
				+
			
 
				         i.update({'table_loc': table_loc, 'image_loc': image_loc})
			
 
				+
			
 
				         results.append(i)
			
 
				 
			
 
				     return results
			
--- a/matcher.py
+++ b/matcher.py
@@ -2,7 +2,11 @@
 
				 # @Author: privacy
			
 
				 # @Date:   2024-06-27 09:33:01
			
 
				 # @Last Modified by:   privacy
			
 
				-# @Last Modified time: 2024-06-27 14:44:43
			
 
				+# @Last Modified time: 2024-08-23 12:10:09
			
 
				+import os
			
 
				+os.environ['TRANSFORMERS_OFFLINE'] = '1'
			
 
				+os.environ['HF_DATASETS_OFFLINE'] = '1'
			
 
				+
			
 
				 import torch
			
 
				 import numpy as np
			
 
				 import pandas as pd
			
@@ -29,18 +33,18 @@ class Matcher:
 
				         return pd.Series([most_similar_keyword, max(similarities)])
			
 
				 
			
 
				     def get_embedding(self, text: str):
			
 
				-        encoded_input = tokenizer(text, return_tensors='pt')
			
 
				+        encoded_input = self.tokenizer(text, return_tensors='pt')
			
 
				         with torch.no_grad():
			
 
				-            output = model(**encoded_input)
			
 
				+            output = self.model(**encoded_input)
			
 
				         text_embedding = np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0)
			
 
				         return text_embedding
			
 
				     
			
 
				     def get_embeddings(self, text_list: list) -> list:
			
 
				         text_embeddings = []
			
 
				         for text in text_list:
			
 
				-            encoded_input = tokenizer(text, return_tensors='pt')
			
 
				+            encoded_input = self.tokenizer(text, return_tensors='pt')
			
 
				             with torch.no_grad():
			
 
				-                output = model(**encoded_input)
			
 
				+                output = self.model(**encoded_input)
			
 
				             text_embeddings.append(np.mean(output.last_hidden_state.mean(dim=1).numpy(), axis=0))
			
 
				         return text_embeddings
			
 
				 
			
--- a/ocr_api.py
+++ b/ocr_api.py
@@ -9,7 +9,7 @@ class OcrAgent:
 
				     def __init__(self, url):
			
 
				         self.url = url
			
 
				 
			
 
				-    def get_content(self, image_path):
			
 
				+    def get_content(self, image_path: str) -> dict:
			
 
				         try:
			
 
				             with open(image_path, 'rb') as image_file:
			
 
				                 files = {"file": ("image.jpg", image_file, "image/jpeg")}
			
@@ -38,5 +38,6 @@ def find_current_row(ocr_result: List[dict], top: int, bottom: int, float_range:
 
				 if __name__ == '__main__':
			
 
				     agent = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
			
 
				     res = agent.get_content(
			
 
				-        os.path.join('/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/images', 'image_page_1131_0.png'))
			
 
				+        os.path.join('D:\\desktop\\三峡水利\\data\\projects\\2022-2025年度三峡电站9台机组检修密封加工制作重新招标\\投标\\东方电气\\extracted_images\\', 'image_page_27_1.jpg'))
			
 
				+    print(res)
			
 
				     pass
			
--- a/project_loc.py
+++ b/project_loc.py
@@ -1,7 +1,11 @@
 
				-from typing import List
			
 
				-from get_info import PdfExtractAttr
			
 
				-# from scan_dir import scan_dir
			
 
				 import time
			
 
				+import json
			
 
				+from typing import (
			
 
				+    List,
			
 
				+    Optional
			
 
				+)
			
 
				+
			
 
				+from get_info import PdfExtractAttr
			
 
				 
			
 
				 
			
 
				 ins = ['合同金额', '合同价格', '发包人名称']
			
@@ -14,10 +18,32 @@ def batch_bool(instances: List[str], text: str) -> bool:
 
				     return False
			
 
				 
			
 
				 
			
 
				-def extract_project(path: str, instances: List[str]):
			
 
				-    agent = PdfExtractAttr(file_path=path)
			
 
				-    tables = agent.parse_table_pro()
			
 
				+def extract_project(instances: List[str], table_dict: Optional[dict] = None, table_path: Optional[str] = None, pdf_path: Optional[str] = None) -> list:
			
 
				+    """
			
 
				+    从表格中抽取项目业绩
			
 
				+
			
 
				+    Args:
			
 
				+        instances:  抽取的字段
			
 
				+        table_dict: json表格
			
 
				+        table_path: 表格文件路径
			
 
				+        pdf_path:   pdf源文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        res 项目业绩表
			
 
				+    """
			
 
				+    if table_dict:
			
 
				+        tables = table_dict
			
 
				+    elif table_path:
			
 
				+        with open(table_path, 'r', encoding='utf-8') as jsonfile:
			
 
				+            tables = json.load(jsonfile)
			
 
				+    elif pdf_path:
			
 
				+        agent = PdfExtractAttr(file_path=pdf_path)
			
 
				+        tables = agent.parse_table_pro()
			
 
				+    else:
			
 
				+        raise ValueError("请输入需要解析的文件！")
			
 
				+
			
 
				     res = []
			
 
				+
			
 
				     for table in tables:
			
 
				 
			
 
				         tab = [[j.replace('\n', '').replace(' ', '') for j in i] for i in table['table']]
			
@@ -26,7 +52,6 @@ def extract_project(path: str, instances: List[str]):
 
				         rows = [row[0] for row in tab]
			
 
				 
			
 
				         for i in rows:
			
 
				-            # if '合同金额' in i or '合同价格' in i or '发包人名称' in i:
			
 
				             if batch_bool(instances, i):
			
 
				                 res.append({
			
 
				                     "page_numbers": pages,
			
@@ -38,16 +63,19 @@ def extract_project(path: str, instances: List[str]):
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    # fs = scan_dir('/home/zzh/ocr/pdf', 'pdf')
			
 
				-    # start = time.time()
			
 
				-    # for f in (fs[:]):
			
 
				-    #     try:
			
 
				-    #         print(f)
			
 
				-    #         print(extract_project(f, ins))
			
 
				-    #         print('\n*********Runtime {} s *********\n'.format(time.time() - start))
			
 
				-    #     except BaseException as e:
			
 
				-    #         print('Something wrong')
			
 
				-    #         print(e)
			
 
				-
			
 
				-    print(extract_project(r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf', ['合同金额', '合同价格', '发包人名称']))
			
 
				+    from pprint import pprint
			
 
				+
			
 
				+    file = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司.pdf'
			
 
				+
			
 
				+    table_path = r'./2022年度工程类-公招采购资料/基于物联网技术的三峡坝区智慧仓储研究与建设/1南方电网数字电网研究院有限公司_T221100130645%2F01整本文件/MainPdfFile/南方电网数字研究院有限公司-table.json'
			
 
				+
			
 
				+    with open(table_path, 'r', encoding='utf-8') as jsonfile:
			
 
				+        tables = json.load(jsonfile)
			
 
				+
			
 
				+    pprint(
			
 
				+        extract_project(
			
 
				+            instances=['合同金额', '合同价格', '发包人名称'],
			
 
				+            table_dict=tables
			
 
				+        )
			
 
				+    )
			
 
				 
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,6 @@ transformers==4.41.2
 
				 textrank4zh==0.3
			
 
				 jieba==0.42.1
			
 
				 camelot==0.20.1
			
 
				-PyMuPDF==1.24.9
			
 
				+PyMuPDF==1.24.9
			
 
				+celery==5.4.0
			
 
				+redis==5.0.8
			
--- a/text_extractor.py
+++ b/text_extractor.py
@@ -6,39 +6,43 @@ from pdfminer.pdfparser import PDFParser
 
				 from matcher import Matcher
			
 
				 from get_info import PdfExtractAttr, is_title
			
 
				 from typing import Callable, Union, List, Tuple, Dict
			
 
				-from re import fullmatch
			
 
				 from tqdm import tqdm
			
 
				 import pandas as pd
			
 
				 
			
 
				 
			
 
				-def absolute_not_title(line: str) -> bool:
			
 
				-    if fullmatch(r'^\d(\d*\.?\d*)+\d(%)?', line):
			
 
				-        return True
			
 
				-    else:
			
 
				-        return False
			
 
				 
			
 
				+# def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
			
 
				+#     """
			
 
				+#     标题解析
			
 
				 
			
 
				-def parse_title(pdf_path: str) -> list[dict[str, int | str | tuple[float, float, float, float]]]:
			
 
				-    texts = []
			
 
				-    for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
			
 
				-                                         total=resolve1(PDFDocument(
			
 
				-                                             PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
			
 
				-                                         ):
			
 
				-        title_index = 0
			
 
				-        for element in page_layout:
			
 
				-            if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
			
 
				-                text = element.get_text().strip()
			
 
				-                if text and (is_title(text) or element.height > 15) and (not absolute_not_title(text)):
			
 
				-                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
			
 
				-                    title_index += 1
			
 
				-    results = []
			
 
				-    for i, text in enumerate(texts):
			
 
				-        results.append({'title': text['text'],
			
 
				-                        'index': text['index'],
			
 
				-                        'page_number': text['page_number'],
			
 
				-                        'seq_num': i
			
 
				-                        })
			
 
				-    return results
			
 
				+#     Args:
			
 
				+#         pdf_path: PDF文件路径
			
 
				+
			
 
				+#     Returns:
			
 
				+#         results
			
 
				+#     """
			
 
				+#     texts = []
			
 
				+
			
 
				+#     for page_number, page_layout in tqdm(enumerate(extract_pages(pdf_path)),
			
 
				+#                                          total=resolve1(PDFDocument(
			
 
				+#                                              PDFParser(open(pdf_path, 'rb'))).catalog['Pages'])['Count']
			
 
				+#                                          ):
			
 
				+#         title_index = 0
			
 
				+#         for element in page_layout:
			
 
				+#             if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
			
 
				+#                 text = element.get_text().strip()
			
 
				+#                 if text and (is_title(text) or element.height > 15):
			
 
				+#                     texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
			
 
				+#                     title_index += 1
			
 
				+#     results = []
			
 
				+
			
 
				+#     for i, text in enumerate(texts):
			
 
				+#         results.append({'title': text['text'],
			
 
				+#                         'index': text['index'],
			
 
				+#                         'page_number': text['page_number'],
			
 
				+#                         'seq_num': i
			
 
				+#                         })
			
 
				+#     return results
			
 
				 
			
 
				 
			
 
				 def pagination_texts(contents: List[dict], start: int, end: int = None) -> Tuple[Dict, List[str]]:
			
@@ -100,27 +104,56 @@ def similar_match(data: List[dict], instances: List[str], key: str) -> {}:
 
				     return max_sim_rows.to_dict(orient='records')
			
 
				 
			
 
				 
			
 
				-def get_instance(title_instances: List[str], content_instances: List[str], pdf: str,
			
 
				+def get_instance(title_instances: List[str],
			
 
				+                 content_instances: List[str],
			
 
				+                 pdf_path: str,
			
 
				                  extractor: Union[Callable[[str, float], List[str]], Callable[[str], List[str]]],
			
 
				-                 page_bias: int = 1, similarity: float = None):
			
 
				-    file = PdfExtractAttr(file_path=pdf)
			
 
				-    # titles = file.parse_outline()
			
 
				-    titles = parse_title(pdf)
			
 
				+                 page_bias: int = 1,
			
 
				+                 similarity: float = None
			
 
				+                ):
			
 
				+    """
			
 
				+    Args:
			
 
				+        title_instances
			
 
				+        content_instances
			
 
				+        pdf_path
			
 
				+        extractor
			
 
				+        page_bias
			
 
				+        similarity
			
 
				+
			
 
				+    Returns:
			
 
				+        results
			
 
				+    """
			
 
				+    file = PdfExtractAttr(file_path=pdf_path)
			
 
				+    titles = file.parse_title()
			
 
				     texts = file.parse_text()
			
 
				 
			
 
				-    title_sims = similarity_filter(similar_match(titles, title_instances, key='title'), similarity)
			
 
				+    title_sims = similarity_filter(
			
 
				+        similar_match(
			
 
				+            titles,
			
 
				+            title_instances,
			
 
				+            key='title'
			
 
				+        ),
			
 
				+        similarity
			
 
				+    )
			
 
				+
			
 
				     results = []
			
 
				+
			
 
				     for i in title_sims:
			
 
				         current_page = i['page_number']
			
 
				         _, text = pagination_texts(texts, current_page, current_page + page_bias)
			
 
				         results.extend(extract_from_texts(text, extractor, content_instances))
			
 
				+
			
 
				     return results
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    # price_zhs = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
			
 
				-    #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				-    #                          match_price_zhs)
			
 
				+    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/投标文件/湖北建新建设工程有限公司_T221100130348%2F01整本文件/MainPdfFile/投标文件-修改版9-5-1-1.pdf'
			
 
				+    price_zhs = get_instance(
			
 
				+        title_instances=['投标函', '开标一览表'],
			
 
				+        content_instances=['人民币投标总报价'],
			
 
				+        pdf_path=pdf_path,
			
 
				+        extractor=match_price_zhs
			
 
				+    )
			
 
				     # price_num = get_instance(['投标函', '开标一览表'], ['人民币投标总报价'],
			
 
				     #                          '/Users/zelate/Codes/pvas/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
			
 
				     #                          match_price_num)
			
@@ -133,4 +166,5 @@ if __name__ == '__main__':
 
				     # valid = rmb_to_digit(price_zhs[0][0][0]) == price_num[0][0][0][1:]
			
 
				     # test = rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')
			
 
				     # valid = (rmb_to_digit('壹仟肆佰贰拾万捌仟玖佰陆拾柒元叁角陆分元')) == '14208967.36'
			
 
				+    print(price_zhs)
			
 
				     pass