get_info.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-09-05 16:29:06

# Standard library imports
import os
import re
import json
from io import BytesIO
from pprint import pprint
from typing import Optional, List

# Third-party imports
import cv2
import numpy as np
import pandas as pd
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JBIG2_DECODE,
    LITERALS_JPX_DECODE,
    LITERALS_FLATE_DECODE,
)
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
import pdfplumber
import camelot

# Local imports
from .tools import RefPageNumberResolver

PIL_ERROR_MESSAGE = "No module named 'PIL', please run 'pip install pillow'"

# Header cells commonly found in bid-document tables; used to recognise standalone table headers.
HEADERS = {'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'}

# Bare numbering such as "1.2.3" or "12%" (never a heading).
pattern_1 = re.compile(r'^\d(\d*\.?\d*)+\d(%)?')
# Chinese section prefixes such as "第一", "一、" or "(一)"; both half- and full-width parentheses are accepted.
pattern_2 = re.compile(r'^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[((][一二三四五六七八九十]+[))]')
# Appendix / reference headings.
pattern_3 = re.compile('^附录|^参考文献|^附表')

def is_title(line: str) -> bool:
    """Decide whether a line of text is a heading.

    Args:
        line: a line of text.
    Returns:
        Whether the line is a heading.
    """
    stripped = line.strip()
    # Bare numbers (e.g. "1.2.3", "12%") are never headings.
    if pattern_1.fullmatch(stripped):
        return False
    if pattern_2.findall(stripped):
        return True
    if pattern_3.findall(stripped):
        return True
    return False
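
# A minimal, self-contained sanity check for is_title(); the sample strings
# below are illustrative, not taken from any real document.
def _demo_is_title() -> None:
    samples = [
        ('第一章 总则', True),   # chapter prefix
        ('一、投标须知', True),  # Chinese numeral with enumeration comma
        ('附录A', True),         # appendix heading
        ('1.2.3', False),        # bare numbering is rejected
        ('12%', False),          # percentages are rejected
    ]
    for text, expected in samples:
        assert is_title(text) == expected, text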

def export_image(image: LTImage, path: str) -> Optional[str]:
    """Save an LTImage to disk; return the saved path, or None when no data is available."""
    filters = image.stream.get_filters()
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        return _save_jpeg(image, path)
    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        return _save_jpeg2000(image, path)
    data = image.stream.get_data()
    raw_data = image.stream.get_rawdata()
    if data:
        # Sniff the format from well-known magic bytes.
        if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':  # JPEG
            path += '.jpg'
            with open(path, 'wb') as file:
                file.write(data)
            return path
        elif data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':  # PNG
            path += '.png'
            with open(path, 'wb') as file:
                file.write(data)
            return path
        elif data[:2] == b'\x42\x4d':  # BMP
            path += '.bmp'
            with open(path, 'wb') as file:
                file.write(data)
            return path
        elif data[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):  # GIF87a / GIF89a
            path += '.gif'
            with open(path, 'wb') as file:
                file.write(data)
            return path
        elif data[:2] in (b'\x4d\x4d', b'\x49\x49'):  # TIFF (big / little endian)
            path += '.tiff'
            with open(path, 'wb') as file:
                file.write(data)
            return path
        elif data[:8] == b'\xffO\xffQ\x00/\x00\x00':  # JPEG 2000 codestream
            return _save_j2k(image, path)
        else:
            path += '.unk'
            with open(path, 'wb') as file:
                file.write(data)
            return path
    elif raw_data:
        # Same sniffing on the raw (undecoded) stream.
        if raw_data[:2] == b'\xff\xd8' and raw_data[-2:] == b'\xff\xd9':  # JPEG
            path += '.jpg'
        elif raw_data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':  # PNG
            path += '.png'
        elif raw_data[:2] == b'\x42\x4d':  # BMP
            path += '.bmp'
        elif raw_data[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):  # GIF
            path += '.gif'
        elif raw_data[:2] in (b'\x4d\x4d', b'\x49\x49'):  # TIFF
            path += '.tiff'
        else:
            path += '.unk'
        with open(path, 'wb') as file:
            file.write(raw_data)
        return path
    else:
        return None
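
# A minimal usage sketch for export_image(): walk the pdfminer layout tree and
# dump every embedded image. 'sample.pdf' and 'out' are hypothetical paths,
# not files shipped with this module.
def _demo_export_image(pdf_path: str = 'sample.pdf', out_dir: str = 'out') -> None:
    os.makedirs(out_dir, exist_ok=True)
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        for element in page_layout:
            if isinstance(element, LTFigure):
                for obj in element._objs:
                    if isinstance(obj, LTImage):
                        # export_image picks the extension from the stream's magic bytes.
                        saved = export_image(obj, os.path.join(out_dir, f'page{page_number}_{obj.name}'))
                        print('saved:', saved)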

def _save_j2k(image: LTImage, path: str) -> str:
    try:
        from PIL import Image
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    path = path + ".png"
    data = image.stream.get_data()
    assert data is not None
    img = Image.open(BytesIO(data))
    img.save(path)
    return path

def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG encoded image"""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".jpg"
    with open(path, "wb") as fp:
        if LITERAL_DEVICE_CMYK in image.colorspace:
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)
            ifp = BytesIO(raw_data)
            i = Image.open(ifp)
            i = ImageChops.invert(i)
            i = i.convert("RGB")
            i.save(fp, "JPEG")
        else:
            fp.write(raw_data)
    return path

def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 encoded image"""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".png"
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    # If we write only the raw data, most image viewers cannot open the file.
    # Opening it and re-saving through OpenCV produces a file that other
    # programs open without trouble.
    ifp = BytesIO(raw_data)
    i = Image.open(ifp)
    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
    cv2.imwrite(path, opencv_image)
    return path

def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
    """Save a BMP encoded image"""
    data = image.stream.get_data()
    path = path + ".bmp"
    with open(path, "wb") as fp:
        fp.write(data)
    return path

def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单', end_title: str = '七、施工组织设计', table_path: str = 'table.json', start_page_number: Optional[int] = None, end_page_number: Optional[int] = None) -> list:
    """Parse the tables of a PDF between two headings.

    Args:
        pdf_path: path to the PDF file.
        title_path: path to the parsed-heading JSON (see PdfExtractAttr.parse_title).
        start_title: heading that marks the first page to parse.
        end_title: heading that marks the last page to parse.
        table_path: output path for the parsed tables.
        start_page_number: first page to parse (resolved from start_title when None).
        end_page_number: last page to parse (resolved from end_title when None).
    """
    tables = []
    if (start_page_number is None) or (end_page_number is None):
        df = pd.read_json(title_path)
        start_page_number = df[df['text'] == start_title].page_number.max()
        end_page_number = df[df['text'] == end_title].page_number.max()

    def concat_table(tables: list, table: list, page_number: int) -> list:
        """Either append the table as a new entry or merge it into the last one.

        A page whose first row contains many known header cells starts a new
        table; otherwise, if the previous page ended with a table of the same
        column count, the rows are treated as a continuation and merged.
        """
        first = [''.join(cell.split()) if cell else cell for cell in table[0]]
        if not tables or len(HEADERS & set(first)) > 2:
            # Many known header cells: a standalone header, so start a new table.
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
        elif ((page_number - 1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
            # Same column count as the table on the previous page: merge as a continuation.
            tables[-1]['page_numbers'].append(page_number)
            tables[-1]['table'].extend(table)
        else:
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
        return tables

    with pdfplumber.open(pdf_path) as pdf:
        for i in range(start_page_number, end_page_number):
            for table in pdf.pages[i].extract_tables():
                tables = concat_table(tables, table, page_number=i)
    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)
    return tables
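
# A minimal usage sketch for table_parse(); 'bid.pdf' is a hypothetical path,
# and passing explicit page numbers skips the title-file lookup entirely.
def _demo_table_parse() -> None:
    tables = table_parse(
        pdf_path='bid.pdf',
        title_path='',          # unused when both page numbers are given
        start_page_number=10,
        end_page_number=20,
        table_path='table.json',
    )
    print(f'{len(tables)} table(s) parsed')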

class PdfExtractAttr(object):
    def __init__(self, file_path: str):
        """PDF parser.

        Args:
            file_path: path to the PDF file.
        """
        super(PdfExtractAttr, self).__init__()
        self.file_path = file_path
        self.tables = []
        self.content = []
        self.chapters = []
        self.references = []
        self.detail_df = None
        self.outlines = None
        self.left = 0
        self.right = 0

    def can_merge_lines(self, line1: LTTextBoxHorizontal, line2: LTTextBoxHorizontal) -> bool:
        """Decide whether two text lines can be merged into one paragraph."""
        # Not mergeable when line1 ends short of the right margin (the
        # paragraph has ended) or line2 is indented (a new paragraph starts).
        return (line1.x1 >= self.right) and (line2.x0 <= self.left)

    def parse_title(self, title_path: Optional[str] = None) -> list:
        """Parse headings, used to uniquely key quotation sections.

        Args:
            title_path: optional output path for the heading list.
        Returns:
            results: list of headings.
        """
        results = []
        seq_num = 0
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            title_index = 0
            for element in page_layout:
                # Headings are assumed to be single-line boxes, either matching
                # the heading patterns or set in a large size.
                if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                    text = element.get_text().strip()
                    if text and (is_title(text) or element.height > 15):
                        results.append({
                            'index': title_index,
                            'page_number': page_number,
                            'bbox': element.bbox,
                            'text': text,
                            'title': text,
                            'seq_num': seq_num
                        })
                        seq_num += 1
                        title_index += 1
        if title_path:
            with open(title_path, 'w', encoding='utf-8') as fp:
                json.dump(results, fp, indent=4, ensure_ascii=False)
        return results
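
    # A minimal usage sketch (hypothetical paths):
    #
    #     agent = PdfExtractAttr(file_path='bid.pdf')
    #     titles = agent.parse_title(title_path='titles.json')
    #     # each entry: {'index', 'page_number', 'bbox', 'text', 'title', 'seq_num'}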

    def parse_image(self, image_dir: str, image_meta_path: Optional[str] = None) -> List[dict]:
        """Extract the images embedded in the PDF.

        Args:
            image_dir: directory the images are written to.
            image_meta_path: optional output path for the image metadata.
        Returns:
            image_list: list of image records.
        """
        image_list = []
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            image_index = 0
            for element in page_layout:
                if isinstance(element, LTFigure):
                    for e_obj in element._objs:
                        if isinstance(e_obj, LTImage):
                            # Dump the image data to disk.
                            image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                            image_file = export_image(e_obj, image_file)
                            image_list.append({
                                "image_index": image_index,
                                "page_number": page_number,
                                "image_name": image_file
                            })
                            image_index += 1
        if image_meta_path:
            with open(image_meta_path, 'w', encoding='utf-8') as fp:
                json.dump(image_list, fp, indent=4, ensure_ascii=False)
        return image_list

    def main_parse(self, title_path: Optional[str] = None, section_path: Optional[str] = None) -> list:
        """Parse the PDF body, grouping text under the outline sections.

        Args:
            title_path: optional output path for the headings.
            section_path: optional output path for the per-section body text.
        """
        # main_parse consumes self.outlines, so build it first if needed.
        if self.outlines is None:
            self.parse_outline()
        self.outlines['text'] = ''
        # Headings
        title_list = []
        # Read the PDF and walk its pages.
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            # Deepest outline entry starting at or before this page.
            max_start_row = self.outlines.query(f''' page_number <= {page_number+1} ''').query(''' page_number == page_number.max() ''').query(''' level == level.max() ''')
            if not max_start_row.empty:
                idx = max_start_row.index.values[0]
            else:
                idx = len(self.outlines.index)
                self.outlines.loc[idx] = {'level': 6, 'title': '', 'page_number': 0, 'text': ''}
            # Left x-coordinates
            x0s = []
            # Right x-coordinates
            x1s = []
            title_index = 0
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    x0s.append(element.x0)
                    x1s.append(element.x1)
            if x0s and x1s:
                # Left margin
                self.left = min(x0s) + 15
                # Right margin
                self.right = max(x1s) - 15
            current = None
            for element in page_layout:
                if isinstance(element, LTLine):
                    pass
                elif isinstance(element, LTRect):
                    pass
                elif isinstance(element, LTTextBoxHorizontal):
                    text = element.get_text().strip()
                    # Headings are assumed to be a single line with large glyphs.
                    if len(element._objs) == 1 and text and (is_title(text) or element.height > 15):
                        title_list.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                        title_index += 1
                        self.outlines.at[idx, 'text'] += '\n'
                        self.outlines.at[idx, 'text'] += text
                    # Body text
                    elif not current or self.can_merge_lines(current, element):  # mergeable
                        current = element
                        for line in element:
                            self.outlines.at[idx, 'text'] += line.get_text().strip()
                    else:  # not mergeable
                        for line in element:
                            self.outlines.at[idx, 'text'] += '\n'
                            self.outlines.at[idx, 'text'] += line.get_text().strip()
        if title_path:
            with open(title_path, 'w', encoding='utf-8') as fp:
                json.dump(title_list, fp, indent=4, ensure_ascii=False)
        if section_path:
            self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
        return title_list

    def extract_toc(self) -> list:
        """Parse the PDF outline from the table-of-contents pages (content-based)."""
        results = []
        for page_number, page in enumerate(extract_pages(self.file_path)):
            is_outline = False
            # The ToC is assumed to live within the first 20 pages, after the cover.
            if page_number < 1:
                continue
            if page_number > 20:
                break
            lines = []
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    for line in element:
                        lines.append(line.get_text().strip())
            for line in lines:
                # A ToC entry: leader dots, a leading digit or CJK character, a trailing page number.
                if line and '......' in line and (line[0].isdigit() or '\u4e00' <= line[0] <= '\u9fff') and line[-1].isdigit():
                    is_outline = True
                    # Indent level (flat for now).
                    indent_level = 1
                    # Entry title.
                    titles = re.findall(r'^[\d\.、]{0,}[\u4e00-\u9fff、()\s]+', line)
                    if not titles:
                        continue
                    title = titles.pop()
                    # Page number.
                    page_n = int(re.findall(r'\d+$', line).pop())
                    # Append to the outline.
                    results.append({
                        "level": indent_level,
                        "title": title,
                        "page_number": page_n
                    })
            if not is_outline:
                break
        return results
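
    # Example of a line extract_toc() accepts as a ToC entry (leader dots plus
    # a trailing page number; illustrative only):
    #
    #     '第一章 招标公告......................5'
    #
    # which yields {'level': 1, 'title': '第一章 招标公告', 'page_number': 5}.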

    def extract_content(self, content_path: Optional[str] = None) -> list:
        """Extract the plain text of every page with pdfplumber."""
        self.content = []
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                self.content.append({
                    'page_number': page.page_number - 1,
                    'text': page.extract_text()
                })
        if content_path:
            with open(content_path, 'w', encoding='utf-8') as fp:
                json.dump(self.content, fp, indent=4, ensure_ascii=False)
        return self.content

    def parse_outline(self, outline_path: Optional[str] = None) -> list:
        """Parse the PDF outline from document metadata; fall back to
        content-based parsing (extract_toc) when no outline is embedded.
        """
        results = []
        with open(self.file_path, "rb") as fp:
            try:
                parser = PDFParser(fp)
                document = PDFDocument(parser)
                ref_pagenum_resolver = RefPageNumberResolver(document)
                outlines = document.get_outlines()
                for (level, title, dest, a, se) in outlines:
                    if dest:
                        page_num = ref_pagenum_resolver.resolve(dest)
                    elif a:
                        page_num = ref_pagenum_resolver.resolve(a)
                    elif se:
                        page_num = ref_pagenum_resolver.resolve(se)
                    else:
                        page_num = None
                    results.append({'level': level, 'title': title, 'page_number': page_num})
            except PDFNoOutlines:
                print("No outlines found.")
            except PDFSyntaxError:
                print("Corrupted PDF or non-PDF file.")
            finally:
                parser.close()
        if not results:
            results = self.extract_toc()
        if outline_path:
            with open(outline_path, 'w', encoding='utf-8') as op:
                json.dump(results, op, indent=4, ensure_ascii=False)
        self.outlines = pd.DataFrame(results)
        return results

    def parse_text(self, text_path: Optional[str] = None) -> List[dict]:
        """Parse the text boxes of every page.

        Args:
            text_path: optional output path for the text blocks.
        Returns:
            List of text blocks.
        """
        seq_num = -1
        text_line = []
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            title_index = 0
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    # Gap to the left page edge
                    left = element.x0
                    # Gap to the right page edge
                    right = (page_layout.width - element.x1)
                    # Box width
                    width = element.width
                    if (left > right) and (abs(left - right) > 100):
                        alignment = 'right'
                    elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
                        alignment = 'center'
                    else:
                        alignment = 'left'
                    text = element.get_text().strip()
                    # Heading?
                    if text and (is_title(text) or element.height > 15) and (len(element._objs) == 1):
                        title_index += 1
                        seq_num += 1
                        text_type = True
                    else:
                        text_type = False
                    # Table caption?
                    is_table_name = bool(text and (text.endswith('表') or text.startswith('表') or text.endswith('清单')))
                    text_line.append({
                        'page_number': page_number,
                        'seq_num': seq_num,
                        'index': element.index,
                        'title_index': title_index,
                        'text': text,
                        'is_title': text_type,
                        'lines': len(element._objs),
                        'is_table_name': is_table_name,
                        'x0': element.bbox[0],
                        'y0': element.bbox[1],
                        'x1': element.bbox[2],
                        'y1': element.bbox[3],
                        'alignment': alignment,
                    })
        if text_path:
            with open(text_path, 'w', encoding='utf-8') as fp:
                json.dump(text_line, fp, indent=4, ensure_ascii=False)
        self.detail_df = pd.DataFrame(text_line)
        return text_line
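
    # The alignment heuristic above, on illustrative numbers:
    #
    #     left=200, right=80   -> 'right'   (pushed toward the right edge)
    #     left=150, right=160  -> 'center'  (wide, roughly symmetric margins)
    #     left=20,  right=300  -> 'left'    (the default)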

    def concat_table(self, table: list, page_number: int, table_name: Optional[str] = None, new: bool = False) -> None:
        """Either append the table as a new entry or merge it into the last one.

        Args:
            table: table rows found on one page.
            page_number: page the rows were found on.
            table_name: optional caption for the table.
            new: force a new entry without any merge check.
        """
        first = [''.join(cell.split()) if cell else cell for cell in table[0]]
        if new:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
            return
        if len(table) > 1:
            second = [''.join(cell.split()) if cell else cell for cell in table[1]]
        else:
            second = None
        if not self.tables or len(HEADERS & set(first)) > 2:
            # Many known header cells in the first row: a standalone header, so start a new table.
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
        elif second and (len(HEADERS & set(second)) > 2):
            # Header cells in the second row: the first row is likely the caption.
            if not table_name:
                first = [i for i in first if i]
                if len(first) == 1:
                    table_name = "".join(first)
            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
        elif ((page_number - 1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
            # Same column count as the table that ended on the previous page: merge as a continuation.
            self.tables[-1]['page_numbers'].append(page_number)
            self.tables[-1]['table'].extend(table)
        else:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
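
    # Illustrative merge behaviour of concat_table() on toy rows (not from a
    # real PDF): a page-1 fragment with the same column count as the page-0
    # table is appended to it rather than opening a new table.
    #
    #     agent.tables = []
    #     agent.concat_table([['序号', '项目名称', '单位'], ['1', '土方', 'm3']], page_number=0)
    #     agent.concat_table([['2', '回填', 'm3']], page_number=1)
    #     assert len(agent.tables) == 1 and len(agent.tables[0]['table']) == 3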

    def parse_table_pro(self, table_path: str = 'all_tables.json') -> list:
        """Parse tables, using pdfplumber for detection and camelot for extraction."""
        self.tables = []
        if self.detail_df is None:
            self.parse_text()
        with pdfplumber.open(self.file_path) as pdf:
            for page_number, page_layout in enumerate(pdf.pages):
                # Does the page contain any table?
                tables = page_layout.find_tables()
                if not tables:
                    continue
                tables_pro = camelot.read_pdf(
                    self.file_path,
                    # flavor='stream',
                    pages=str(page_number + 1),
                    # edge_tol=200,
                )
                if not tables_pro:
                    continue
                print(len(tables), len(tables_pro))
                # Exactly one table on the page: check whether it continues the previous one.
                if (len(tables) != 0) and (len(tables_pro) == 1):
                    print(f"Parsing the table on page {page_number} of the PDF")
                    table = tables[0]
                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
                    x0, y0, x1, y1 = table.bbox
                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
                    if table_title_df.empty:
                        self.concat_table(table_pro, page_number=page_number)
                    else:
                        table_title_name = table_title_df.iloc[0]['text']
                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
                # Several tables: only the first may continue the previous page;
                # the rest are necessarily independent.
                elif len(tables_pro) > 1:
                    print(f"Parsing the tables on page {page_number} of the PDF")
                    first_table = tables_pro[0]
                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
                    for table_index in range(1, len(tables_pro)):
                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)
        if table_path:
            with open(table_path, 'w', encoding='utf-8') as fp:
                json.dump(self.tables, fp, indent=4, ensure_ascii=False)
        return self.tables

if __name__ == '__main__':
    # pdf_path = './投标文件-修改版9-5-1-1.pdf'
    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/三峡左岸及地下电站地坪整治招标文件(发售版).pdf'
    table_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/三峡左岸及地下电站地坪整治招标文件(发售版)-table.json'
    agent = PdfExtractAttr(file_path=pdf_path)
    agent.parse_table_pro(table_path=table_path)
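
    # A fuller pipeline sketch (hypothetical output paths; order matters,
    # since main_parse() consumes the outline built by parse_outline()):
    #
    #     agent.parse_outline(outline_path='outline.json')
    #     agent.parse_text(text_path='texts.json')
    #     agent.main_parse(title_path='titles.json', section_path='sections.json')
    #     agent.parse_image(image_dir='./images', image_meta_path='images.json')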