# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-08-27 14:50:15

# import os
# from PIL import Image
# from PyPDF2 import PdfReader
#
# # Read the PDF file
# with open(pdf_path, 'rb') as file:
#     reader = PdfReader(file)
#     num_pages = len(reader.pages)
#     # Iterate over every page of the PDF
#     for page_num in range(num_pages):
#         page = reader.pages[page_num]
#         # Extract the images on the page
#         if '/XObject' in page['/Resources']:
#             xobjects = page['/Resources']['/XObject'].get_object()
#             for obj in xobjects:
#                 if xobjects[obj]['/Subtype'] == '/Image':
#                     size = (xobjects[obj]['/Width'], xobjects[obj]['/Height'])
#                     data = xobjects[obj].get_data()
#                     if xobjects[obj]['/ColorSpace'] == '/DeviceRGB':
#                         mode = "RGB"
#                     else:
#                         mode = "P"
#                     img = Image.frombytes(mode, size, data)
#                     img_path = os.path.join(output_dir, f'image_{page_num}_{obj}.png')
#                     img.save(img_path)
#                     print(f'Image saved: {img_path}')
#######################################################################
# import os
# import re
# import fitz
#
# def pdf2pic(path, save_path):
#     checkXO = r"/Type(?= */XObject)"
#     checkIM = r"/Subtype(?= */Image)"
#     pdf = fitz.open(path)
#     lenXREF = pdf._getXrefLength()
#     imgcount = 0
#     for i in range(1, lenXREF):
#         text = pdf._getXrefString(i)
#         isXObject = re.search(checkXO, text)
#         isImage = re.search(checkIM, text)
#         if not isXObject or not isImage:
#             continue
#         imgcount += 1
#         pix = fitz.Pixmap(pdf, i)
#         new_name = f"img_{imgcount}.png"
#         if pix.n < 5:
#             pix.writePNG(os.path.join(save_path, new_name))
#         else:
#             pix0 = fitz.Pixmap(fitz.csRGB, pix)
#             pix0.writePNG(os.path.join(save_path, new_name))
#             pix0 = None
#         pix = None
#
# if __name__ == '__main__':
#     pdf2pic(pdf_path, image_dir)
#######################################################################
# Standard library imports
import os
import re
import json
from io import BytesIO
from pprint import pprint
from typing import Optional

# Third-party imports
import cv2
import numpy as np
import pandas as pd
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JBIG2_DECODE,
    LITERALS_JPX_DECODE,
    LITERALS_FLATE_DECODE,
)
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
import pdfplumber
import camelot

# Local imports
from tools import RefPageNumberResolver

PIL_ERROR_MESSAGE = "No module named 'PIL', please run 'pip install pillow'"

# Known table-header cell values, matched against cell text extracted from the PDFs.
HEADERS = {'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'}

# Pure numbers (optionally ending in a percent sign) are never titles.
pattern_1 = re.compile(r'^\d(\d*\.?\d*)+\d(%)?')
# Chapter- or section-style numbering such as “第一…”, “一、…”, or “(一)…”.
pattern_2 = re.compile(r'^第[一二三四五六七八九十]+|^[一二三四五六七八九十\d]+、|^[(（][一二三四五六七八九十]+[)）]')
# Back matter such as appendices and references.
pattern_3 = re.compile(r'^附录|^参考文献|^附表')

def is_title(line: str) -> bool:
    """Determine whether a line of text is a title.

    Args:
        line: the text line.
    Returns:
        True if the line looks like a title.
    """
    # if re.fullmatch(r'^\d(\d*\.?\d*)+\d(%)?', line.strip()):
    if pattern_1.fullmatch(line.strip()):
        return False
    # title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
    title_word = pattern_2.findall(line.strip())
    if title_word:
        return True
    title_word = pattern_3.findall(line.strip())
    if title_word:
        return True
    return False
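
# Illustrative sanity checks for is_title(); the inputs are made up, not from
# the original file, but follow the three patterns above:
# >>> is_title('3.1415')       # pure number, rejected by pattern_1 -> False
# >>> is_title('第一章 总则')   # chapter heading, pattern_2 -> True
# >>> is_title('一、工程概况')  # “一、” numbering, pattern_2 -> True
# >>> is_title('附录A')         # appendix, pattern_3 -> True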

def export_image(image: LTImage, path: str) -> Optional[str]:
    """Save an LTImage to disk and return the file path (None if the stream
    holds no data). The format is chosen from the stream filters, or by
    sniffing magic bytes when the filters are inconclusive."""
    filters = image.stream.get_filters()
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        return _save_jpeg(image, path)
    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        return _save_jpeg2000(image, path)

    data = image.stream.get_data()
    raw_data = image.stream.get_rawdata()

    def write(buffer: bytes, ext: str) -> str:
        out = path + ext
        with open(out, 'wb') as file:
            file.write(buffer)
        return out

    # Prefer the decoded data; fall back to the raw stream.
    for buffer in (data, raw_data):
        if not buffer:
            continue
        if buffer[:2] == b'\xff\xd8' and buffer[-2:] == b'\xff\xd9':
            return write(buffer, '.jpg')
        elif buffer[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
            return write(buffer, '.png')
        elif buffer[:2] == b'\x42\x4d':
            return write(buffer, '.bmp')
        elif buffer[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):
            return write(buffer, '.gif')
        elif buffer[:2] in (b'\x4d\x4d', b'\x49\x49'):
            return write(buffer, '.tiff')
        elif buffer is data and buffer[:8] == b'\xffO\xffQ\x00/\x00\x00':
            # JPEG 2000 codestream; only handled for decoded data, as before.
            return _save_j2k(image, path)
        else:
            return write(buffer, '.unk')
    return None
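
# Illustrative use of export_image() (hypothetical paths): walk a pdfminer
# page layout and dump every embedded image, mirroring what main_parse()
# below does.
# for element in page_layout:
#     if isinstance(element, LTFigure):
#         for obj in element._objs:
#             if isinstance(obj, LTImage):
#                 saved = export_image(obj, './images/page_0_img_0')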

def _save_j2k(image: LTImage, path: str) -> str:
    """Save a raw JPEG 2000 codestream by re-encoding it as PNG via Pillow."""
    try:
        from PIL import Image
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    path = path + ".png"
    data = image.stream.get_data()
    assert data is not None
    byte_stream = BytesIO(data)
    roiImg = Image.open(byte_stream)
    roiImg.save(path)
    return path


def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG encoded image"""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".jpg"
    with open(path, "wb") as fp:
        if LITERAL_DEVICE_CMYK in image.colorspace:
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)
            ifp = BytesIO(raw_data)
            i = Image.open(ifp)
            i = ImageChops.invert(i)
            i = i.convert("RGB")
            i.save(fp, "JPEG")
        else:
            fp.write(raw_data)
    return path


def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 encoded image"""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".png"
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    # If we wrote only the raw data, most image programs I tried could not
    # open the file; opening it and re-saving through OpenCV yields a file
    # that other programs seem to open without trouble.
    ifp = BytesIO(raw_data)
    i = Image.open(ifp)
    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
    cv2.imwrite(path, opencv_image)
    return path


def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
    """Save a BMP encoded image"""
    data = image.stream.get_data()
    path = path + ".bmp"
    with open(path, "wb") as fp:
        fp.write(data)
    return path

def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单',
                end_title: str = '七、施工组织设计', table_path: str = 'table.json',
                start_page_number: Optional[int] = None, end_page_number: Optional[int] = None) -> list:
    """Parse the tables of a PDF between two titled sections.

    @pdf_path: path of the PDF to parse
    @title_path: JSON file of parsed titles, used to locate the page range
    @start_title: title that opens the range
    @end_title: title that closes the range
    @table_path: output JSON path for the parsed tables
    @start_page_number: explicit first page, overrides the title lookup
    @end_page_number: explicit last page, overrides the title lookup
    """
    tables = []
    if (start_page_number is None) or (end_page_number is None):
        df = pd.read_json(title_path)
        start_page_number = df[df['text'] == start_title].page_number.max()
        end_page_number = df[df['text'] == end_title].page_number.max()

    def concat_table(tables, table, page_number):
        """Try to add a table to the results. Two cases: append a brand-new
        table, or splice the rows onto the last table when it continues
        across pages.

        @tables: accumulated table records
        @table: the table rows extracted from the current page
        @page_number: index of the current page
        """
        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
        # pprint(first)
        if len(HEADERS & set(first)) > 2:
            # Many known header cells: standalone header, start a new table.
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
        elif tables and ((page_number - 1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
            # Same column count as the table on the previous page: merge.
            tables[-1]['page_numbers'].append(page_number)
            tables[-1]['table'].extend(table)
        else:
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
        return tables

    with pdfplumber.open(pdf_path) as pdf:
        for i in range(start_page_number, end_page_number):
            for table in pdf.pages[i].extract_tables():
                tables = concat_table(tables, table, i)
    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)
    return tables
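
# Illustrative call of table_parse() (paths are hypothetical): extract the
# priced bill-of-quantities tables between the two default headings and dump
# them to table.json.
# tables = table_parse('tender.pdf', 'titles.json',
#                      start_title='六、已标价工程量清单',
#                      end_title='七、施工组织设计')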

class PdfExtractAttr(object):

    def __init__(self, file_path: str):
        """PDF file parser.

        @file_path: path of the PDF to parse
        """
        super(PdfExtractAttr, self).__init__()
        self.file_path = file_path
        self.details = []
        self.tables = []
        self.content = []
        self.chapters = []
        self.references = []
        self.detail_df = None
        self.outlines = None

    def parse_title(self) -> list:
        """Parse the titles of the document."""
        texts = []
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            title_index = 0
            for element in page_layout:
                # Assume a title is a single line with a larger font
                if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                    text = element.get_text().strip()
                    if text and (is_title(text) or element.height > 15):
                        texts.append({
                            'index': title_index,
                            'page_number': page_number,
                            'bbox': element.bbox,
                            'text': text
                        })
                        title_index += 1
        results = []
        for i, text in enumerate(texts):
            results.append({
                'title': text['text'],
                'index': text['index'],
                'page_number': text['page_number'],
                'seq_num': i
            })
        return results
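
    # Each record returned by parse_title() has this shape (values invented):
    # {'title': '第一章 招标公告', 'index': 0, 'page_number': 3, 'seq_num': 12}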

    def can_merge_lines(self, line1, line2) -> bool:
        """Decide whether two adjacent lines can be merged into one paragraph."""
        # line1 ends early, i.e. its paragraph already terminated
        if line1.x1 < self.right:
            return False
        # line2 is indented, i.e. it starts a new paragraph
        if line2.x0 > self.left:
            return False
        return True
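
    # Worked example (coordinates invented): with page margins giving
    # self.left = 105 and self.right = 480, a line ending at x1 = 490 followed
    # by a line starting at x0 = 95 merges (full-width line, no indent), while
    # a following line starting at x0 = 130 begins a new paragraph instead.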

    def main_parse(self, title_path: Optional[str] = None, section_path: Optional[str] = None, image_dir: Optional[str] = None) -> None:
        """Parse the PDF: titles, body text and images.

        Requires parse_outline() to have been called first, so that
        self.outlines is populated.

        Parameters:
        - title_path: str, where to save the titles
        - section_path: str, where to save the section text
        - image_dir: str, directory for extracted images
        """
        self.outlines['text'] = ''
        # Titles
        texts = []
        # Images
        images = []
        # Read the PDF file and walk its pages
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            # Deepest outline entry that starts at or before this page
            max_start_row = self.outlines.query(f''' page_number <= {page_number+1} ''').query(''' page_number == page_number.max() ''').query(''' level == level.max() ''')
            if not max_start_row.empty:
                idx = max_start_row.index.values[0]
            else:
                idx = len(self.outlines.index)
                self.outlines.loc[idx] = {'level': 6, 'title': '', 'page_number': 0, 'text': ''}
            # Left x coordinates
            x0s = []
            # Right x coordinates
            x1s = []
            title_index = 0
            image_index = 0
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    x0s.append(element.x0)
                    x1s.append(element.x1)
            if x0s and x1s:
                # Left page margin
                self.left = min(x0s) + 15
                # Right page margin
                self.right = max(x1s) - 15
            current = None
            for element in page_layout:
                if isinstance(element, LTLine):
                    pass
                elif isinstance(element, LTRect):
                    pass
                elif isinstance(element, LTTextBoxHorizontal):
                    # Text
                    text = element.get_text().strip()
                    # Assume a title is a single line with a larger font
                    if len(element._objs) == 1 and text and (is_title(text) or element.height > 15):
                        texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                        title_index += 1
                        self.outlines.at[idx, 'text'] += '\n'
                        self.outlines.at[idx, 'text'] += text
                    # Body text
                    elif not current or self.can_merge_lines(current, element):  # can merge
                        current = element
                        for line in element:
                            self.outlines.at[idx, 'text'] += line.get_text().strip()
                    else:  # cannot merge
                        for line in element:
                            self.outlines.at[idx, 'text'] += '\n'
                            self.outlines.at[idx, 'text'] += line.get_text().strip()
                elif image_dir and isinstance(element, LTFigure):
                    for e_obj in element._objs:
                        if isinstance(e_obj, LTImage):
                            # Extract the image data
                            image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                            image_file = export_image(e_obj, image_file)
                            images.append(image_file)
                            pprint(f'Image saved: {image_file}')
                            image_index += 1
        if title_path:
            with open(title_path, 'w', encoding='utf-8') as fp:
                json.dump(texts, fp, indent=4, ensure_ascii=False)
        if section_path:
            self.outlines.to_json(section_path, orient='records', lines=True, force_ascii=False)
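
    # Typical calling sequence (paths hypothetical); the outline must be
    # parsed before main_parse(), as noted in the docstring:
    # agent = PdfExtractAttr(file_path='tender.pdf')
    # agent.parse_outline()
    # agent.main_parse(title_path='titles.json', section_path='sections.json',
    #                  image_dir='./extracted_images')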

    def extract_toc(self) -> list:
        """Parse the PDF outline from page content, i.e. the printed table of contents."""
        results = []
        for page_number, page in enumerate(extract_pages(self.file_path)):
            is_outline = False
            # The table of contents is assumed to sit within pages 2-21
            if page_number < 1:
                continue
            if page_number > 20:
                break
            lines = []
            for element in page:
                if isinstance(element, LTTextBoxHorizontal):
                    for line in element:
                        lines.append(line.get_text().strip())
            for line in lines:
                # Check whether the line matches the TOC format: leader dots,
                # starts with a digit or CJK character, ends with a page number
                if line and '......' in line and (line[0].isdigit() or '\u4e00' <= line[0] <= '\u9fff') and line[-1].isdigit():
                    is_outline = True
                    # Indentation level
                    indent_level = 1
                    # The entry title
                    title = re.findall(r'^[\d\.、]{0,}[\u4e00-\u9fff、()\s]+', line).pop()
                    # The page number
                    page_n = int(re.findall(r'\d+$', line).pop())
                    # Append to the outline structure
                    results.append({
                        "level": indent_level,
                        "title": title,
                        "page_number": page_n
                    })
            if not is_outline:
                break
        return results
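
    # A line the TOC matcher above accepts (made-up example):
    #   '第一章 招标公告............12'
    #   -> {'level': 1, 'title': '第一章 招标公告', 'page_number': 12}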

    def extract_content(self, content_path: Optional[str] = None) -> list:
        """Extract the plain text of every page."""
        self.content = []
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                self.content.append({
                    'page_number': page.page_number - 1,
                    'text': page.extract_text()
                })
        if content_path:
            with open(content_path, 'w', encoding='utf-8') as fp:
                json.dump(self.content, fp, indent=4, ensure_ascii=False)
        return self.content

    def parse_outline(self, outline_path: Optional[str] = None) -> list:
        """Parse the PDF outline from document metadata, falling back to
        content-based parsing (extract_toc) when the metadata has none."""
        results = []
        with open(self.file_path, "rb") as fp:
            try:
                parser = PDFParser(fp)
                document = PDFDocument(parser)
                ref_pagenum_resolver = RefPageNumberResolver(document)
                outlines = document.get_outlines()
                for (level, title, dest, a, se) in outlines:
                    if dest:
                        page_num = ref_pagenum_resolver.resolve(dest)
                    elif a:
                        page_num = ref_pagenum_resolver.resolve(a)
                    elif se:
                        page_num = ref_pagenum_resolver.resolve(se)
                    else:
                        page_num = None
                    results.append({'level': level, 'title': title, 'page_number': page_num})
            except PDFNoOutlines:
                print("No outlines found.")
            except PDFSyntaxError:
                print("Corrupted PDF or non-PDF file.")
            finally:
                parser.close()
        if not results:
            results = self.extract_toc()
        if outline_path:
            with open(outline_path, 'w', encoding='utf-8') as op:
                json.dump(results, op, indent=4, ensure_ascii=False)
        self.outlines = pd.DataFrame(results)
        return results
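
    # parse_outline() also caches the result as a DataFrame in self.outlines,
    # which main_parse() depends on; each record looks like (values invented):
    # {'level': 1, 'title': '第一章 招标公告', 'page_number': 3}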

    def parse_text(self, text_path: Optional[str] = None) -> list:
        """Parse the text boxes of the document and estimate their alignment."""
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    # Distance from the left edge
                    left = element.x0
                    # Distance from the right edge
                    right = (page_layout.width - element.x1)
                    # Distance from the top edge
                    top = (page_layout.height - element.y1)
                    # Distance from the bottom edge
                    bottom = element.y0
                    # Width of the text box
                    width = element.width
                    if (left > right) and (abs(left - right) > 100):
                        alignment = 'right'
                    elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
                        alignment = 'center'
                    else:
                        alignment = 'left'
                    self.details.append({
                        'page_number': page_number,
                        'index': element.index,
                        'x0': element.bbox[0],
                        'y0': element.bbox[1],
                        'x1': element.bbox[2],
                        'y1': element.bbox[3],
                        'alignment': alignment,
                        'lines': len(element._objs),
                        'text': element.get_text().strip(),
                        'is_table_name': element.get_text().strip().endswith('表')
                    })
        if text_path:
            with open(text_path, 'w', encoding='utf-8') as fp:
                json.dump(self.details, fp, indent=4, ensure_ascii=False)
        self.detail_df = pd.DataFrame(self.details)
        return self.details
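
    # The alignment heuristic, in layout points (values invented): a box with
    # left = 200, right = 60 is 'right' (left exceeds right by more than 100);
    # one with left = 210, right = 190, width = 150 is 'center' (margins
    # nearly equal); anything else falls back to 'left'.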

    def concat_table(self, table: list, page_number: int, table_name: Optional[str] = None, new: bool = False) -> None:
        """Try to add a table to the results. Two cases: append a brand-new
        table, or splice the rows onto the last table when it continues
        across pages.

        @table: the table rows extracted from the current page
        @page_number: index of the current page
        @table_name: optional table title found above the table
        @new: force a new table without attempting a merge
        """
        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
        if new:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
            return
        if len(table) > 1:
            second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
        else:
            second = None
        # pprint(first)
        if not self.tables or len(HEADERS & set(first)) > 2:
            # Many known header cells in the first row: standalone header, start a new table.
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
        elif second and (len(HEADERS & set(second)) > 2):
            # Header cells in the second row: the first row is the table title.
            if not table_name:
                first = [i for i in first if i]
                if len(first) == 1:
                    table_name = "".join(first)
            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
        elif ((page_number - 1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
            # Same column count as the table on the previous page: merge.
            self.tables[-1]['page_numbers'].append(page_number)
            self.tables[-1]['table'].extend(table)
        else:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
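
    # Decision summary (illustrative): a first row such as
    # ['序号', '项目名称', '单位', '工程量'] shares more than two cells with
    # HEADERS and therefore starts a new table; a header-less continuation
    # page with the same column count as the previous table is merged into it.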

    def parse_table_pro(self, table_path: str = 'all_tables.json') -> list:
        """Parse the tables of the document, cross-checking pdfplumber
        detections against camelot extractions."""
        self.tables = []
        if self.detail_df is None:
            self.parse_text()
        with pdfplumber.open(self.file_path) as pdf:
            for page_number, page_layout in enumerate(pdf.pages):
                # Check whether the page contains any table
                tables = page_layout.find_tables()
                if not tables:
                    continue
                tables_pro = camelot.read_pdf(
                    self.file_path,
                    # flavor='stream',
                    pages=str(page_number+1),
                    # edge_tol=200,
                )
                if not tables_pro:
                    continue
                print(len(tables), len(tables_pro))
                # One table on this page: check whether it merges with the previous one
                if (len(tables) != 0) and (len(tables_pro) == 1):
                    print(f"Parsing the table on page {page_number} of the PDF")
                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
                    # A centered text box ending in “表” on this page is taken as the table title
                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
                    if table_title_df.empty:
                        self.concat_table(table_pro, page_number=page_number)
                    else:
                        table_title_name = table_title_df.iloc[0]['text']
                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
                # Several tables detected: only the first may continue the previous
                # page's table; the later ones are definitely independent
                elif len(tables_pro) > 1:
                    print(f"Parsing the tables on page {page_number} of the PDF")
                    first_table = tables_pro[0]
                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
                    for table_index in range(1, len(tables_pro)):
                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)
        if table_path:
            with open(table_path, 'w', encoding='utf-8') as fp:
                json.dump(self.tables, fp, indent=4, ensure_ascii=False)
        return self.tables
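
# Illustrative run of parse_table_pro() (hypothetical path); parse_text() is
# invoked implicitly when the text details are not cached yet:
# agent = PdfExtractAttr(file_path='tender.pdf')
# all_tables = agent.parse_table_pro(table_path='all_tables.json')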

if __name__ == '__main__':
    pdf_path = './投标文件-修改版9-5-1-1.pdf'
    # pdf_path = './南方电网数字研究院有限公司.pdf'
    # pdf_path = './2022年度工程类-公招采购资料/2022-2025年度三峡电站9台机组检修密封加工制作重新招标/2022-2025年度三峡电站9台机组检修密封加工制作重新招标招标文件印刷版.pdf'
    # title_path = './投标文件-修改版9-5-1-1.json'
    # title_path = './投标文件-修改版9-5-1-1-title.json'
    # title_path = './南方电网数字研究院有限公司.json'
    # section_path = './投标文件-修改版9-5-1-1-section.json'
    # section_path = './南方电网数字研究院有限公司-section.json'
    # image_dir = './extracted_images'
    # os.makedirs(image_dir, exist_ok=True)
    # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title='七、施工组织设计')
    # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
    # pdf_path = './2022年度工程类-公招采购资料/三峡右岸电站35kV及10kV厂用电系统保护装置换型/三峡右岸电站35kV和10kV厂用电系统保护装置换型招标文件审批稿 (3).pdf'
    # table_path = './2022年度工程类-公招采购资料/三峡右岸电站35kV及10kV厂用电系统保护装置换型/三峡右岸电站35kV和10kV厂用电系统保护装置换型招标文件审批稿 (3)-table.json'
    pdf_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/三峡左岸及地下电站地坪整治招标文件(发售版).pdf'
    table_path = './2022年度工程类-公招采购资料/三峡左岸及地下电站地坪整治/三峡左岸及地下电站地坪整治招标文件(发售版)-table.json'
    agent = PdfExtractAttr(file_path=pdf_path)
    # agent.parse_outline()
    # agent.main_parse(title_path=title_path, section_path=section_path)
    agent.parse_table_pro(table_path=table_path)