tools.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-04 10:21:48
  6. import os
  7. import re
  8. import json
  9. from enum import Enum, auto
  10. from typing import Any, Optional, List
  11. import pandas as pd
  12. from pdfminer.pdftypes import PDFObjRef
  13. from pdfminer.pdfdocument import PDFDocument
  14. from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
  15. import pdfplumber
  16. def check_scan_pdf(file_path: str) -> bool:
  17. """
  18. 测试PDF文件是否为扫描件
  19. Args:
  20. file_path: 文件地址
  21. Returns:
  22. bool: 是否为扫描件
  23. """
  24. probability_page = 0
  25. with pdfplumber.open(file_path) as pdf:
  26. page_num = len(pdf.pages)
  27. for page in pdf.pages:
  28. content = page.extract_text()
  29. if len(content) > 50:
  30. probability_page += 1
  31. if (probability_page / page_num) > 0.1:
  32. return False
  33. return True
  34. def num_to_chinese(num: int) -> str:
  35. """
  36. 数字转中文
  37. Args:
  38. num: 待转数字
  39. Returns:
  40. 数字的中文表示
  41. """
  42. chinese_num = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
  43. chinese_unit = ['', '十', '百', '千', '万']
  44. if num == 0:
  45. return chinese_num[0]
  46. res = ''
  47. unit_index = 0
  48. while num > 0:
  49. digit = num % 10
  50. if digit != 0:
  51. res = chinese_num[digit] + chinese_unit[unit_index] + res
  52. elif not res.startswith(chinese_num[0]):
  53. res = chinese_num[0] + res
  54. num //= 10
  55. unit_index += 1
  56. return res.replace('一十', '十').rstrip('零')
  57. def chinese_to_num(chinese_num: str) -> int:
  58. """
  59. 中文转数字
  60. Args:
  61. chinese_num: 待转中文
  62. Returns:
  63. 数字
  64. """
  65. number_map = {'零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
  66. # 单位映射
  67. unit_map = {'十': 10, '百': 100, '千': 1000, '万': 10000}
  68. output = 0
  69. unit = 1
  70. num = 0
  71. for index, cn_num in enumerate(chinese_num):
  72. if cn_num in number_map:
  73. # 数字
  74. num = number_map[cn_num]
  75. # 最后的个位数字
  76. if index == len(chinese_num) - 1:
  77. output = output + num
  78. elif cn_num in unit_map:
  79. # 单位
  80. unit = unit_map[cn_num]
  81. # 累加
  82. output = output + num * unit
  83. num = 0
  84. else:
  85. raise ValueError(f"{cn_num} 不在转化范围内")
  86. return output
  87. def next_chinese_num(chinese_num: str) -> str:
  88. """
  89. 中文数字加一
  90. Args:
  91. chinese_num: 待加中文数字
  92. Returns:
  93. 加一后的中文
  94. """
  95. num = chinese_to_num(chinese_num)
  96. return num_to_chinese(num + 1)
  97. def filter_images(image_list: list, start_page: int, end_page: int) -> List[dict]:
  98. """
  99. 从已解析的图片中筛选出指定页面的图片
  100. Args:
  101. image_list: 图片列表
  102. start_page: 起始页码
  103. end_page: 终止页码
  104. Returns:
  105. 从起始页码到终止页码间的图片列表
  106. """
  107. df = pd.DataFrame(image_list)
  108. return df.query(f''' {start_page} <= page_number <= {end_page} ''').to_dict(orient='records')
  109. def filter_tables(table_list: list, start_page: int, end_page: int) -> List[dict]:
  110. """
  111. 从已解析的表格中筛选出指定页面的表格
  112. Args:
  113. table_list: 表格列表
  114. start_page: 起始页码
  115. end_page: 终止页码
  116. Returns:
  117. 从起始页码到终止页码间的表格列表
  118. """
  119. return [table for table in table_list if (start_page <= min(table['page_numbers'])) and (end_page >= max(table['page_numbers']))]
  120. def rmb_to_digit(rmb_str: str):
  121. digit_map = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
  122. unit_map = {'分': 0.01, '角': 0.1, '元': 1, '拾': 10, '佰': 100, '仟': 1000, '万': 10000, '亿': 100000000}
  123. digit = 0
  124. total = 0
  125. tmp = 0
  126. for char in rmb_str:
  127. if char in digit_map:
  128. digit = digit_map[char]
  129. elif char in unit_map:
  130. if digit + tmp:
  131. total += (tmp + digit) * unit_map[char]
  132. tmp = digit = 0
  133. else:
  134. total *= unit_map[char]
  135. else:
  136. tmp = digit
  137. total += tmp + digit
  138. return '{:.2f}'.format(total)
  139. def match_price_zhs(text: str) -> List[str]:
  140. pattern = (r"[壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟][壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟,元,角,万,分,百,整,零]+"
  141. r"[壹,贰,叁,肆,伍,陆,柒,捌,玖,拾,佰,仟,元,角,万,分,百,整,零]")
  142. temp = re.findall(pattern, text)
  143. for i in range(len(temp)):
  144. if temp[i].endswith('整元') or temp[i].endswith('角元') or temp[i].endswith('分元') or temp[i].endswith('元元'):
  145. temp[i] = temp[i][:-1]
  146. return temp
  147. def match_price_num(text: str) -> List[str]:
  148. pattern = (r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|"
  149. r"ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR|"
  150. r"PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
  151. r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)")
  152. return re.findall(pattern, text)
  153. def match_duration(text: str) -> List[str]:
  154. pattern = r"[1-9]+[\d]日历天"
  155. return re.findall(pattern, text)
  156. def match_quality(text: str) -> List[str]:
  157. pattern = r"工程质量.+"
  158. return re.findall(pattern, text)
  159. class PDFRefType(Enum):
  160. """PDF reference type."""
  161. PDF_OBJ_REF = auto()
  162. DICTIONARY = auto()
  163. LIST = auto()
  164. NAMED_REF = auto()
  165. UNK = auto() # fallback
  166. class RefPageNumberResolver:
  167. """PDF Reference to page number resolver.
  168. .. note::
  169. Remote Go-To Actions (see 12.6.4.3 in
  170. `https://www.adobe.com/go/pdfreference/`__)
  171. are out of the scope of this resolver.
  172. Attributes:
  173. document (:obj:`pdfminer.pdfdocument.PDFDocument`):
  174. The document that contains the references.
  175. objid_to_pagenum (:obj:`dict[int, int]`):
  176. Mapping from an object id to the number of the page that contains
  177. that object.
  178. """
  179. def __init__(self, document: PDFDocument):
  180. self.document = document
  181. # obj_id -> page_number
  182. self.objid_to_pagenum: dict[int, int] = {
  183. page.pageid: page_num
  184. for page_num, page in enumerate(PDFPage.create_pages(document), 1)
  185. }
  186. @classmethod
  187. def get_ref_type(cls, ref: Any) -> PDFRefType:
  188. """Get the type of a PDF reference."""
  189. if isinstance(ref, PDFObjRef):
  190. return PDFRefType.PDF_OBJ_REF
  191. elif isinstance(ref, dict) and "D" in ref:
  192. return PDFRefType.DICTIONARY
  193. elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
  194. return PDFRefType.LIST
  195. elif isinstance(ref, bytes):
  196. return PDFRefType.NAMED_REF
  197. else:
  198. return PDFRefType.UNK
  199. @classmethod
  200. def is_ref_page(cls, ref: Any) -> bool:
  201. """Check whether a reference is of type '/Page'.
  202. Args:
  203. ref (:obj:`Any`):
  204. The PDF reference.
  205. Returns:
  206. :obj:`bool`: :obj:`True` if the reference references
  207. a page, :obj:`False` otherwise.
  208. """
  209. return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE
  210. def resolve(self, ref: Any) -> Optional[int]:
  211. """Resolve a PDF reference to a page number recursively.
  212. Args:
  213. ref (:obj:`Any`):
  214. The PDF reference.
  215. Returns:
  216. :obj:`Optional[int]`: The page number or :obj:`None`
  217. if the reference could not be resolved (e.g., remote Go-To
  218. Actions or malformed references).
  219. """
  220. ref_type = self.get_ref_type(ref)
  221. if ref_type is PDFRefType.PDF_OBJ_REF and self.is_ref_page(ref.resolve()):
  222. return self.objid_to_pagenum.get(ref.objid)
  223. elif ref_type is PDFRefType.PDF_OBJ_REF:
  224. return self.resolve(ref.resolve())
  225. if ref_type is PDFRefType.DICTIONARY:
  226. return self.resolve(ref["D"])
  227. if ref_type is PDFRefType.LIST:
  228. # Get the PDFObjRef in the list (usually first element).
  229. return self.resolve(next(filter(lambda e: isinstance(e, PDFObjRef), ref)))
  230. if ref_type is PDFRefType.NAMED_REF:
  231. return self.resolve(self.document.get_dest(ref))
  232. return None # PDFRefType.UNK
  233. class BaseMethods:
  234. ''' base methods class
  235. '''
  236. def __init__(self) -> None:
  237. pass
  238. def pandas_read_xls(self, file_path: str, sheetname: str = "Sheet1"):
  239. ''' 读取xls文件方法
  240. '''
  241. return pd.read_excel(file_path, sheet_name=sheetname)
  242. def json_read(self, file_path: str):
  243. ''' 读取json文件方法
  244. '''
  245. with open(file_path, "r", encoding='utf-8') as fp:
  246. return json.load(fp)
  247. def save_file(self, save_data: list, save_path: str, file_format: str):
  248. ''' 保存文件
  249. '''
  250. if file_format == "json":
  251. with open(save_path, 'w', encoding='utf-8') as sf:
  252. sf.write(json.dumps(save_data, ensure_ascii=False))
  253. elif file_format == "xlsx" or file_format == "xls":
  254. with pd.ExcelWriter(save_path) as fp:
  255. save_data.to_excel(fp, sheet_name="Sheet1")
  256. elif file_format == 'txt':
  257. with open(save_path, 'w', encoding='utf-8') as tx:
  258. for data in save_data:
  259. tx.write(data + "\n")
  260. def traverse_file(self, dirpath: str):
  261. '''
  262. 遍历文件夹下文件
  263. '''
  264. filename = tuple()
  265. for root, dir, files in os.walk(dirpath):
  266. for name in files:
  267. filename = filename.__add__((name,))
  268. return filename