123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-06-11 13:43:14
- # @Last Modified by: privacy
- # @Last Modified time: 2024-09-30 11:29:16
- import os
- import re
- import json
- from enum import Enum, auto
- from typing import Any, Optional, List
- import pandas as pd
- from pdfminer.pdftypes import PDFObjRef
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
- import pdfplumber
def comment_clean(comment: str):
    """Strip enum-repr residue from a raw LLM response.

    Every ``=<LevelEnum.X:`` fragment (``X`` in ``A``-``D``) and every ``>``
    character is removed from the text.

    Args:
        comment: Raw text returned by the LLM.

    Returns:
        The cleaned text. Falsy input (``None`` or an empty string) is
        returned unchanged.
    """
    if comment:
        # Two alternatives: the enum prefix, or any closing angle bracket.
        return re.sub(r'=<LevelEnum\.[A-D]:|>', '', comment)
    return comment
def check_scan_pdf(file_path: str) -> bool:
    """Guess whether a PDF is a scanned (image-only) document.

    A page counts as a "text page" when more than 50 characters can be
    extracted from it. The file is considered NOT scanned when more than
    10% of its pages are text pages.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        ``True`` if the file looks like a scan, ``False`` otherwise.
        A PDF without any pages has no text layer and is reported as a scan.
    """
    with pdfplumber.open(file_path) as pdf:
        pages = pdf.pages
        page_num = len(pages)
        if page_num == 0:
            # Avoid dividing by zero below; nothing extractable means "scan".
            return True
        # `extract_text()` may yield None for a page without a text layer,
        # so normalise to an empty string before measuring its length.
        probability_page = sum(
            1 for page in pages if len(page.extract_text() or "") > 50
        )
    return (probability_page / page_num) <= 0.1
def num_to_chinese(num: int) -> str:
    """Convert a non-negative integer to its Chinese numeral text.

    The number is rendered in groups of four digits (个/万/亿/万亿). A leading
    "一十" is shortened to "十" (10 -> "十", 15 -> "十五", 100000 -> "十万"),
    but an inner "一十" is kept (110 -> "一百一十"), so the result can be
    parsed back unambiguously.

    Args:
        num: Integer to convert; must satisfy ``0 <= num < 10**16``.

    Returns:
        The Chinese numeral representation of ``num``.

    Raises:
        ValueError: If ``num`` is negative or too large to be expressed with
            the supported units.
    """
    digits = '零一二三四五六七八九'
    small_units = ('', '十', '百', '千')
    big_units = ('', '万', '亿', '万亿')

    if num < 0:
        raise ValueError(f"num must be non-negative, got {num}")
    if num == 0:
        return digits[0]

    def _section_to_chinese(value: int) -> str:
        """Render 1..9999; an inner run of zeros becomes a single '零'."""
        text = ''
        zero_pending = False
        for pos in range(3, -1, -1):
            digit = value // 10 ** pos % 10
            if digit == 0:
                # Only zeros *between* non-zero digits are spoken.
                zero_pending = bool(text)
                continue
            if zero_pending:
                text += digits[0]
                zero_pending = False
            text += digits[digit] + small_units[pos]
        return text

    # Split into four-digit sections, lowest section first.
    sections = []
    remaining = num
    while remaining:
        remaining, low = divmod(remaining, 10000)
        sections.append(low)
    if len(sections) > len(big_units):
        raise ValueError(f"num is too large to convert: {num}")

    res = ''
    skipped_zero_section = False
    for idx in range(len(sections) - 1, -1, -1):
        value = sections[idx]
        if value == 0:
            skipped_zero_section = bool(res)
            continue
        # A '零' is required when a whole section was skipped or when this
        # section does not fill its thousands place (e.g. 10001 -> 一万零一).
        if res and (skipped_zero_section or value < 1000):
            res += digits[0]
        res += _section_to_chinese(value) + big_units[idx]
        skipped_zero_section = False

    # Only a *leading* "一十" is abbreviated.
    if res.startswith('一十'):
        res = res[1:]
    return res
def chinese_to_num(chinese_num: str) -> int:
    """Convert a Chinese numeral string to an integer.

    Supports the digits 零-九 and the units 十/百/千/万. A "十" that is not
    preceded by a digit is read as "一十" ("十" -> 10, "十一" -> 11,
    "一百十" -> 110). "万" multiplies everything accumulated before it
    ("十二万" -> 120000).

    Args:
        chinese_num: Chinese numeral text, e.g. "一百二十三".

    Returns:
        The integer value; an empty string yields 0.

    Raises:
        ValueError: If a character is neither a supported digit nor a unit.
    """
    number_map = {'零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
    # Unit multipliers.
    unit_map = {'十': 10, '百': 100, '千': 1000, '万': 10000}

    total = 0      # value already closed off by a "万"
    section = 0    # value accumulated since the last "万"
    digit = None   # digit waiting for its unit; None when no digit is pending

    for cn_num in chinese_num:
        if cn_num in number_map:
            digit = number_map[cn_num]
        elif cn_num in unit_map:
            unit = unit_map[cn_num]
            if unit == 10000:
                # "万" scales the whole section in front of it, not one digit.
                total = (total + section + (digit or 0)) * unit
                section = 0
            else:
                if digit is None:
                    # A bare "十" means "一十"; other bare units add nothing.
                    digit = 1 if unit == 10 else 0
                section += digit * unit
            digit = None
        else:
            raise ValueError(f"{cn_num} 不在转化范围内")

    # A trailing digit without a unit is the ones place.
    return total + section + (digit or 0)
def next_chinese_num(chinese_num: str) -> str:
    """Return the Chinese numeral that follows ``chinese_num``.

    The text is parsed with ``chinese_to_num``, incremented by one and
    rendered again with ``num_to_chinese``.

    Args:
        chinese_num: Chinese numeral text to increment.

    Returns:
        The Chinese numeral text of the incremented value.
    """
    return num_to_chinese(chinese_to_num(chinese_num) + 1)
def filter_images(image_list: list, start_page: int, end_page: int) -> List[dict]:
    """Select the parsed images that lie on a given page range.

    Args:
        image_list: Parsed image records; each record carries a
            ``page_number`` entry.
        start_page: First page of the range (inclusive).
        end_page: Last page of the range (inclusive).

    Returns:
        The records whose ``page_number`` is within
        ``[start_page, end_page]``, as a list of dicts. An empty
        ``image_list`` yields an empty list.
    """
    if len(image_list) == 0:
        # An empty frame has no `page_number` column to filter on.
        return []
    df = pd.DataFrame(image_list)
    # Boolean mask instead of a string-built query: same inclusive bounds,
    # but the page limits are never interpolated into an expression.
    in_range = df['page_number'].between(start_page, end_page)
    return df[in_range].to_dict(orient='records')
def filter_tables(table_list: list, start_page: int, end_page: int) -> List[dict]:
    """Select the parsed tables that lie entirely inside a page range.

    Args:
        table_list: Parsed table records; each record carries a
            ``page_numbers`` collection.
        start_page: First page of the range (inclusive).
        end_page: Last page of the range (inclusive).

    Returns:
        The tables whose smallest and largest page number both fall within
        ``[start_page, end_page]``, in their original order.
    """
    selected = []
    for table in table_list:
        # A table may span several pages; every one of them must be in range.
        starts_in_range = start_page <= min(table['page_numbers'])
        if starts_in_range and end_page >= max(table['page_numbers']):
            selected.append(table)
    return selected
def filter_content(content_list: list, start_page: int, end_page: int) -> List[dict]:
    """Select the parsed content items that lie on a given page range.

    Args:
        content_list: Parsed content records; each record carries a
            ``page_number`` entry.
        start_page: First page of the range (inclusive).
        end_page: Last page of the range (inclusive).

    Returns:
        The records whose ``page_number`` is within
        ``[start_page, end_page]``, in their original order.
    """
    selected = []
    for item in content_list:
        if start_page <= item['page_number'] <= end_page:
            selected.append(item)
    return selected
def rmb_to_digit(rmb_str: str):
    """Convert an upper-case Chinese RMB amount to a decimal string.

    The amount is parsed in sections: digits combine with 拾/佰/仟 inside a
    section, 万 and 亿 scale the section in front of them, 元 closes the
    integer part, and 角/分 add tenths and hundredths. Characters that are
    neither digits nor units (such as "整") are ignored. The simplified
    forms 十/百/千 are accepted as aliases of 拾/佰/仟.

    Args:
        rmb_str: Amount text such as "壹佰贰拾叁元肆角伍分".

    Returns:
        The amount formatted with exactly two decimals, e.g. "123.45".
    """
    digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
    section_units = {'拾': 10, '十': 10, '佰': 100, '百': 100, '仟': 1000, '千': 1000}

    yuan = 0        # integer part already closed by 万 / 亿 / 元
    section = 0     # value of the section currently being read
    fen = 0         # fractional part, counted in hundredths
    pending = None  # digit waiting for its unit; None when nothing is pending

    for char in rmb_str:
        if char in digit_map:
            pending = digit_map[char]
        elif char in section_units:
            unit = section_units[char]
            if pending is None:
                # A bare "拾" reads as "壹拾"; other bare units add nothing.
                pending = 1 if unit == 10 else 0
            section += pending * unit
            pending = None
        elif char == '万':
            # 万 scales the whole section in front of it (壹拾贰万 = 120000).
            yuan += (section + (pending or 0)) * 10000
            section, pending = 0, None
        elif char == '亿':
            yuan = (yuan + section + (pending or 0)) * 100000000
            section, pending = 0, None
        elif char == '元':
            yuan += section + (pending or 0)
            section, pending = 0, None
        elif char == '角':
            fen += (pending or 0) * 10
            pending = None
        elif char == '分':
            fen += pending or 0
            pending = None
        # Anything else (e.g. "整", punctuation, spaces) carries no value.

    # Amounts written without a closing "元" still count as yuan.
    yuan += section + (pending or 0)

    # Integer arithmetic keeps the two decimals exact.
    whole, cents = divmod(yuan * 100 + fen, 100)
    return f'{whole}.{cents:02d}'
def match_price_zhs(text: str) -> List[str]:
    """Find upper-case Chinese price expressions in ``text``.

    A match starts with a digit or a 拾/佰/仟 unit and continues over at
    least two further digit/unit characters. A redundant trailing "元"
    after "整", "角", "分" or "元" is trimmed.

    Args:
        text: Text to search.

    Returns:
        All matched price strings in order of appearance.
    """
    # No separators inside the character classes: a comma there would be a
    # literal member and let "," be matched as part of a price.
    pattern = (r"[壹贰叁肆伍陆柒捌玖拾佰仟][壹贰叁肆伍陆柒捌玖拾佰仟元角万分百整零]+"
               r"[壹贰叁肆伍陆柒捌玖拾佰仟元角万分百整零]")
    redundant_tails = ('整元', '角元', '分元', '元元')
    return [
        price[:-1] if price.endswith(redundant_tails) else price
        for price in re.findall(pattern, text)
    ]
def match_price_num(text: str) -> List[str]:
    """Find numeric price expressions in ``text``.

    A match is a currency marker (a known currency code/sign, or any single
    character that is neither a word character nor whitespace), an optional
    whitespace character, and a number with optional thousands separators
    and up to two decimals.

    Args:
        text: Text to search.

    Returns:
        All matched price strings in order of appearance.
    """
    price_re = re.compile(
        r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
        r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
        r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
        r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
    )
    return price_re.findall(text)
def match_duration(text: str) -> List[str]:
    """Find construction durations written as "<N>日历天" in ``text``.

    Args:
        text: Text to search.

    Returns:
        All matched duration strings (number plus "日历天") in order of
        appearance.
    """
    # A number without a leading zero. The former `[1-9]+[\d]` demanded that
    # every digit but the last be non-zero, so "100日历天" was never found.
    pattern = r"[1-9]\d*日历天"
    return re.findall(pattern, text)
def match_quality(text: str) -> List[str]:
    """Find project-quality statements in ``text``.

    Args:
        text: Text to search.

    Returns:
        Every fragment that starts with "工程质量" and runs to the end of
        its line (at least one character must follow the keyword).
    """
    quality_re = re.compile(r"工程质量.+")
    return quality_re.findall(text)
class PDFRefType(Enum):
    """Kinds of PDF reference distinguished when resolving page numbers."""

    # An indirect object reference.
    PDF_OBJ_REF = auto()
    # A dictionary carrying a "D" entry.
    DICTIONARY = auto()
    # A list containing at least one indirect object reference.
    LIST = auto()
    # A named reference given as bytes.
    NAMED_REF = auto()
    # Fallback for anything that fits none of the above.
    UNK = auto()
class RefPageNumberResolver:
    """PDF reference to page number resolver.

    .. note::
        Remote Go-To Actions (see 12.6.4.3 in
        `https://www.adobe.com/go/pdfreference/`__)
        are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from an object id to the number of the page that
            contains that object.
    """

    def __init__(self, document: PDFDocument):
        self.document = document
        # page object id -> page number, counted from 1 in iteration order
        self.objid_to_pagenum: dict[int, int] = {}
        for page_num, page in enumerate(PDFPage.create_pages(document), 1):
            self.objid_to_pagenum[page.pageid] = page_num

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Classify a PDF reference.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The detected kind, :obj:`PDFRefType.UNK` when
            the value matches none of the known shapes.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        if isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        if isinstance(ref, list) and any(isinstance(item, PDFObjRef) for item in ref):
            return PDFRefType.LIST
        if isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a resolved object is of type '/Page'.

        Args:
            ref (:obj:`Any`):
                The resolved PDF object.

        Returns:
            :obj:`bool`: :obj:`True` if the object is a dictionary whose
            "Type" entry is the page literal, :obj:`False` otherwise.
        """
        if not isinstance(ref, dict):
            return False
        return "Type" in ref and ref["Type"] is LITERAL_PAGE

    def resolve(self, ref: Any) -> Optional[int]:
        """Resolve a PDF reference to a page number recursively.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None`
            if the reference could not be resolved (e.g., remote Go-To
            Actions or malformed references).
        """
        kind = self.get_ref_type(ref)

        if kind is PDFRefType.PDF_OBJ_REF:
            target = ref.resolve()
            if self.is_ref_page(target):
                # The reference points straight at a page object.
                return self.objid_to_pagenum.get(ref.objid)
            # Otherwise keep following whatever the reference points to.
            return self.resolve(target)

        if kind is PDFRefType.DICTIONARY:
            return self.resolve(ref["D"])

        if kind is PDFRefType.LIST:
            # Follow the first object reference in the list (usually the
            # first element).
            return self.resolve(next(item for item in ref if isinstance(item, PDFObjRef)))

        if kind is PDFRefType.NAMED_REF:
            return self.resolve(self.document.get_dest(ref))

        return None  # PDFRefType.UNK
class BaseMethods:
    """Small collection of file reading, writing and listing helpers."""

    def __init__(self) -> None:
        pass

    def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
        """Read one sheet of an Excel file.

        Args:
            file_path: Path of the Excel file.
            sheetname: Name of the sheet to load.

        Returns:
            Whatever ``pandas.read_excel`` returns for that sheet.
        """
        return pd.read_excel(file_path, sheet_name=sheetname)

    def json_read(self, file_path: str):
        """Load a UTF-8 encoded JSON file.

        Args:
            file_path: Path of the JSON file.

        Returns:
            The decoded JSON content.
        """
        with open(file_path, "r", encoding='utf-8') as fp:
            return json.load(fp)

    def save_file(self, save_data: list, save_path: str, file_format: str):
        """Write ``save_data`` to ``save_path`` in the requested format.

        Args:
            save_data: Data to store. For "json" any JSON-serialisable
                value; for "xlsx"/"xls" a DataFrame or anything
                ``pandas.DataFrame`` can be built from; for "txt" an
                iterable of strings, written one per line.
            save_path: Destination file path.
            file_format: One of "json", "xlsx", "xls" or "txt". Any other
                value writes nothing.
        """
        if file_format == "json":
            with open(save_path, 'w', encoding='utf-8') as sf:
                sf.write(json.dumps(save_data, ensure_ascii=False))
        elif file_format in ("xlsx", "xls"):
            # The parameter is documented as a list, which has no
            # `to_excel`; wrap anything that is not already a DataFrame.
            frame = save_data if isinstance(save_data, pd.DataFrame) else pd.DataFrame(save_data)
            with pd.ExcelWriter(save_path) as fp:
                frame.to_excel(fp, sheet_name="Sheet1")
        elif file_format == 'txt':
            with open(save_path, 'w', encoding='utf-8') as tx:
                tx.writelines(data + "\n" for data in save_data)

    def traverse_file(self, dirpath: str):
        """Collect the names of all files below a directory.

        Args:
            dirpath: Root directory to walk recursively.

        Returns:
            A tuple with the bare file names (without directory part) in
            the order ``os.walk`` reports them.
        """
        return tuple(
            name
            for _root, _dirs, files in os.walk(dirpath)
            for name in files
        )
- class TitleLevelJudge:
- def __init__(self, titles: List[str]):
- self.titles = titles
- self.levels = self.judge_title_level(self.titles)
- @classmethod
- def judge_title_level(cls, titles: List[str]) -> List[int]:
- """
- 判断标题的等级
- 规则1000. 默认第一个标题的等级为 1
- 往下遍历标题
- 判断标题是否在正则表达式中,如果在,使用 规则1100.,如果不在,使用 规则1200.
- 规则1100. 判断标题使用的正则表达式是否为上个标题使用的正则表达式,如果是,则使用 规则1110. 如果否则使用 规则1120.
- 规则1110. 当前标题和上一个标题在同一个等级
- 规则1120. 向上查找,标题等级依次降低,如果找到,则使用 规则1121., 如果提升,则使用 规则1122.
- 规则1121. 使用找的标题等级
- 规则1122. 标题等级提升
- 规则1200. 特殊标题,标题等级提升
- """
- # 定义用于提取标题结构的正则表达式
- patterns = [
- r'^第[一二三四五六七八九十百]+章', # 例如:“第一章”
- r'^第[一二三四五六七八九十百]+条', # 例如:“第一条”
- r'^第[一二三四五六七八九十百]+部分', # 例如:“第一部分”
- r'^第\d+章', # 例如:“第3章”
- r'^第 \d+ 章', # 例如:“第 3 章”
- r'^第\d+条', # 例如:“第3条”
- r'^第 \d+ 条', # 例如:“第 3 条”
- r'^第\d+部分', # 例如:“第3部分”
- r'^第 \d+ 部分', # 例如:“第 3 部分”
- r'^([一二三四五六七八九十百]+)', # 例如:“(一)”
- r'^([\d]+)', # 例如:“(1)”
- r'^[一二三四五六七八九十百]+、', # 例如:“一、”
- r'^[一二三四五六七八九十百]+)', # 例如:“一)”
- r'^[一二三四五六七八九十百]+\)', # 例如:“一)”
- r'^\d+、', # 例如:“1、”
- r'^\d+)', # 例如:“1)”
- r'^\d+\)', # 例如:“1)”
- r'^\d+-\d+', # 例如:“5-2”
- r'^\d+\.\d+\.\d+\.\d+\.\d+\.\d+\.\d+\.\d+', # 例如:“1.1.1.1”
- r'^\d+\.\d+\.\d+\.\d+\.\d+\.\d+\.\d+', # 例如:“1.1.1.1”
- r'^\d+\.\d+\.\d+\.\d+\.\d+\.\d+', # 例如:“1.1.1.1”
- r'^\d+\.\d+\.\d+\.\d+\.\d+', # 例如:“1.1.1.1”
- r'^\d+\.\d+\.\d+\.\d+', # 例如:“1.1.1.1”
- r'^\d+\.\d+\.\d+', # 例如:“1.1.1”
- r'^\d+\.\d+', # 例如:“1.1”
- r'^\d+\.', # 例如:“1.”
- r'^文件 [一二三四五六七八九十百]+', # 例如:“文件 一”
- r'^附件 [一二三四五六七八九十百]+', # 例如:“附件 一”
- r'^附录 [一二三四五六七八九十百]+', # 例如:“附录 一”
- r'^文件[一二三四五六七八九十百]+', # 例如:“文件一”
- r'^附件[一二三四五六七八九十百]+', # 例如:“附件一”
- r'^附录[一二三四五六七八九十百]+', # 例如:“附录一”
- r'^文件 \d', # 例如:“文件 1”
- r'^附件 \d', # 例如:“附件 1”
- r'^附录 \d', # 例如:“附录 1”
- r'^文件\d', # 例如:“文件1”
- r'^附件\d', # 例如:“附件1”
- r'^附录\d', # 例如:“附录1”
- r'^图', # 例如:“图:1”
- r'图$', # 例如:“示例图”
- r'^表', # 例如:“表:1”
- r'^附表', # 例如:“附表:1”
- r'表$', # 例如:“示例表”
- r'函$', # 例如:“合规承诺函”
- r'承诺书$', # 例如:“合规承诺书”
- r'证书$', # 例如:“投标人专利证书”
- r'专利$', # 例如:“发明专利”
- r'^[一二三四五六七八九十百]+', # 例如:“一”
- r'^\d+', # 例如:“1”
- r'.*?' # 任意匹配
- ]
- # 初始化标题等级列表
- level_list = []
- pattern_list = []
- # 遍历所有标题
- for title in titles:
- # 遍历所有结构模式
- for i, pattern in enumerate(patterns):
- if re.match(pattern, title):
- current_pattern = i + 1
- break
- # 规则1000. 默认第一个标题的等级为 1
- if not level_list:
- current_level = 1
- # 判断标题是否在正则表达式中
- elif current_pattern in pattern_list:
- # 规则1100. 判断标题使用的正则表达式是否为上个标题使用的正则表达式
- if current_pattern == pattern_list[-1]:
- # 当前标题和上一个标题在同一个等级
- current_level = level_list[-1]
- # 规则1120. 向上查找,标题等级依次降低
- else:
- # 上一个等级
- tl = level_list[-1]
- for p, l in zip(pattern_list[::-1], level_list[::-1]):
- if (current_pattern == p) and current_level < (tl + 12):
- current_level = l
- break
- # 规则1200. 特殊标题,标题等级提升
- else:
- current_level = level_list[-1] + 1
- # 将当前标题的等级添加到列表中
- pattern_list.append(current_pattern)
- level_list.append(current_level)
- return level_list
- def find_next_title(self, current_title: str) -> Optional[str]:
- # 获取当前标题的索引
- current_index = self.titles.index(current_title)
- # 从当前标题的下一个标题开始遍历
- for i in range(current_index + 1, len(self.titles)):
- # 如果下一个标题的等级小于等于当前标题的等级,则返回该标题
- if self.levels[i] <= self.levels[current_index]:
- return self.titles[i]
- # 如果没有找到满足条件的标题,则返回None
- return None
|