# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-07-04 09:59:10
# import os
# from PIL import Image
# from PyPDF2 import PdfReader
#
# # Read the PDF file
# with open(pdf_path, 'rb') as file:
#     reader = PdfReader(file)
#     num_pages = len(reader.pages)
#     # Iterate over every page of the PDF
#     for page_num in range(num_pages):
#         page = reader.pages[page_num]
#         # Extract the images embedded in this page
#         if '/XObject' in page['/Resources']:
#             xobjects = page['/Resources']['/XObject'].get_object()
#             for obj in xobjects:
#                 if xobjects[obj]['/Subtype'] == '/Image':
#                     size = (xobjects[obj]['/Width'], xobjects[obj]['/Height'])
#                     data = xobjects[obj].get_data()
#                     if xobjects[obj]['/ColorSpace'] == '/DeviceRGB':
#                         mode = "RGB"
#                     else:
#                         mode = "P"
#                     img = Image.frombytes(mode, size, data)
#                     img_path = os.path.join(output_dir, f'image_{page_num}_{obj}.png')
#                     img.save(img_path)
#                     print(f'Image saved: {img_path}')
#######################################################################
# import os
# import re
# import fitz
#
# def pdf2pic(path, pic_path):
#     checkXO = r"/Type(?= */XObject)"
#     checkIM = r"/Subtype(?= */Image)"
#     pdf = fitz.open(path)
#     lenXREF = pdf._getXrefLength()
#     imgcount = 0
#     for i in range(1, lenXREF):
#         text = pdf._getXrefString(i)
#         isXObject = re.search(checkXO, text)
#         isImage = re.search(checkIM, text)
#         if not isXObject or not isImage:
#             continue
#         imgcount += 1
#         pix = fitz.Pixmap(pdf, i)
#         new_name = f"img_{imgcount}.png"
#         if pix.n < 5:
#             pix.writePNG(os.path.join(pic_path, new_name))
#         else:
#             pix0 = fitz.Pixmap(fitz.csRGB, pix)
#             pix0.writePNG(os.path.join(pic_path, new_name))
#             pix0 = None
#         pix = None
#
# if __name__ == '__main__':
#     pdf2pic(pdf_path, image_dir)
#######################################################################

# Standard library imports
import os
import re
import json
from io import BytesIO
from pprint import pprint

# Third-party imports
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JBIG2_DECODE,
    LITERALS_JPX_DECODE,
    LITERALS_FLATE_DECODE,
)
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.image import BMPWriter
import pdfplumber

# Local imports
from tools import RefPageNumberResolver

# Cell values that identify a table header row (kept in Chinese because they
# are matched verbatim against cells of the source PDFs).
HEADERS = {'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'}

PIL_ERROR_MESSAGE = "PIL导入错误"


def load_json(data_path: str):
    try:
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Error: The file '{data_path}' was not found.")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from '{data_path}': {e}")
        return None
    except Exception as e:
        print(f"Error loading JSON from '{data_path}': {e}")
        return None


# is_title decides whether a line of text looks like a heading.
def is_title(line: str) -> bool:
    # The first pattern matches any of:
    #   - a Chinese numeral wrapped in full-width or ASCII parentheses, e.g. (一)
    #   - a single digit followed by a dot, e.g. "3."
    #   - a number 10-29 followed by a dot, e.g. "12."
    #   - 第 + Chinese numeral or digits + 章/节/条, e.g. "第三章"
    #   - a Chinese numeral followed by 、 or by one of 要/是
    title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
    # Any match means the line is treated as a heading.
    if title_word:
        return True
    # Lines starting with 附录 (appendix), 参考文献 (references) or 附表
    # (attached table) also count as headings.
    title_word = re.findall('^附录|^参考文献|^附表', line.strip())
    if title_word:
        return True
    # Nothing matched: not a heading.
    return False
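

# Illustrative checks for is_title (documentation examples, not part of the
# original logic):
#     is_title('(一)总则')        -> True   (Chinese numeral in parentheses)
#     is_title('3.施工方案')       -> True   (digit followed by a dot)
#     is_title('第三章 评标办法')  -> True   (第 ... 章 pattern)
#     is_title('附录A')            -> True   (附录 prefix)
#     is_title('本工程位于广州。') -> False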


# export_image saves an LTImage object under the given path prefix and
# returns the final file path (with extension).
def export_image(image: LTImage, path: str) -> str:
    """Save an LTImage to disk"""
    (width, height) = image.srcsize
    ### Check the image encoding
    # Filters applied to the image stream.
    filters = image.stream.get_filters()
    # A single DCTDecode filter means the stream is already a JPEG:
    # hand it to _save_jpeg.
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        name = _save_jpeg(image, path)
        return name
    # A single JPXDecode filter means JPEG 2000: hand it to _save_jpeg2000.
    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        name = _save_jpeg2000(image, path)
        return name
    # elif image.bits == 1:
    #     name = _save_bmp(image, width, height, (width + 7) // 8, image.bits, path)
    # elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
    #     name = _save_bmp(image, width, height, width * 3, image.bits * 3, path)
    # elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
    #     name = _save_bmp(image, width, height, width, image.bits, path)
    # elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
    #     name = _save_bytes(image)
    # else:
    #     name = _save_raw(image)
    # Decoded stream data.
    data = image.stream.get_data()
    # Raw (undecoded) stream data.
    raw_data = image.stream.get_rawdata()
    # If decoded data exists, pick an extension from its magic bytes.
    if data:
        if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':
            path += '.jpg'
        elif data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
            path += '.png'
        elif data[:2] == b'\x42\x4d':
            path += '.bmp'
        elif data[:6] == b'\x47\x49\x46\x38\x37\x61' or data[:6] == b'\x47\x49\x46\x38\x39\x61':
            path += '.gif'
        elif data[:2] == b'\x4d\x4d' or data[:2] == b'\x49\x49':
            path += '.tiff'
        elif data[:8] == b'\xffO\xffQ\x00/\x00\x00':
            name = _save_j2k(image, path)
            return name
        else:
            path += '.unk'
        if os.path.exists(path):
            return path
        else:
            with open(path, 'wb') as file:
                file.write(data)
            return path
    # No decoded data: fall back to the raw data and sniff it the same way.
    elif raw_data:
        if raw_data[:2] == b'\xff\xd8' and raw_data[-2:] == b'\xff\xd9':
            path += '.jpg'
        elif raw_data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
            path += '.png'
        elif raw_data[:2] == b'\x42\x4d':
            path += '.bmp'
        elif raw_data[:6] == b'\x47\x49\x46\x38\x37\x61' or raw_data[:6] == b'\x47\x49\x46\x38\x39\x61':
            path += '.gif'
        elif raw_data[:2] == b'\x4d\x4d' or raw_data[:2] == b'\x49\x49':
            path += '.tiff'
        elif raw_data[:8] == b'\xffO\xffQ\x00/\x00\x00':
            name = _save_j2k(image, path)
            return name
        else:
            path += '.unk'
        if os.path.exists(path):
            return path
        else:
            with open(path, 'wb') as file:
                file.write(raw_data)
            return path
    # Neither decoded nor raw data: give up.
    else:
        return None
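

# NOTE: illustrative helper (added for documentation; not referenced by
# export_image above). It restates, in one place, the magic-number table
# that export_image uses to choose a file suffix; the raw JPEG 2000
# codestream case ('.j2k') is the one export_image re-encodes to PNG via
# _save_j2k.
def _sniff_image_suffix(buf: bytes) -> str:
    """Guess a file suffix from magic bytes; '.unk' when unknown."""
    if buf[:2] == b'\xff\xd8' and buf[-2:] == b'\xff\xd9':
        return '.jpg'   # JPEG: SOI ... EOI markers
    if buf[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
        return '.png'   # PNG signature
    if buf[:2] == b'\x42\x4d':
        return '.bmp'   # BMP: 'BM'
    if buf[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):
        return '.gif'   # GIF87a / GIF89a
    if buf[:2] in (b'\x4d\x4d', b'\x49\x49'):
        return '.tiff'  # TIFF: big- or little-endian byte-order mark
    if buf[:8] == b'\xffO\xffQ\x00/\x00\x00':
        return '.j2k'   # raw JPEG 2000 codestream
    return '.unk'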


def _save_j2k(image: LTImage, path: str) -> str:
    """Save a raw JPEG 2000 codestream by re-encoding it as PNG."""
    try:
        from PIL import Image
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    path = path + ".png"
    data = image.stream.get_data()
    assert data is not None
    byte_stream = BytesIO(data)
    roiImg = Image.open(byte_stream)
    roiImg.save(path)
    return path


# _save_jpeg writes a DCT-encoded (JPEG) LTImage to disk.
def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG encoded image"""
    # The raw stream data is already a complete JPEG body.
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".jpg"
    if os.path.exists(path):
        return path
    with open(path, "wb") as fp:
        # A DeviceCMYK colorspace needs to be inverted and converted to
        # RGB before it can be written out as a regular JPEG.
        if LITERAL_DEVICE_CMYK in image.colorspace:
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)
            # Load raw_data into memory so PIL can open it.
            ifp = BytesIO(raw_data)
            i = Image.open(ifp)
            # Invert the colors (CMYK JPEGs are typically stored inverted).
            i = ImageChops.invert(i)
            # Convert to RGB before saving as JPEG.
            i = i.convert("RGB")
            # Write the processed image into the target file object.
            i.save(fp, "JPEG")
        # Not CMYK: the raw stream can be written out unchanged.
        else:
            fp.write(raw_data)
    # Return the path that was written.
    return path


def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 encoded image"""
    # The raw stream data holds the JPEG 2000 payload.
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".png"
    if os.path.exists(path):
        return path
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    # Writing the raw data directly produces a file that most image viewers
    # cannot open; decoding with PIL and re-saving through OpenCV yields a
    # file that other programs open without trouble.
    try:
        # Load raw_data into memory so PIL can open it.
        ifp = BytesIO(raw_data)
        i = Image.open(ifp)
        # PIL arrays are RGB while OpenCV expects BGR, so convert the
        # channel order before handing the array to cv2.imwrite.
        opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
        cv2.imwrite(path, opencv_image)
    except (ValueError, OSError) as e:
        pprint(f'Error processing image: {e}')
    return path


# _save_bmp writes a BMP-encoded image to the given path and returns it.
# The geometry parameters mirror pdfminer's BMPWriter interface but are
# not used by this implementation.
def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
    """Save a BMP encoded image"""
    # Decoded stream data.
    data = image.stream.get_data()
    path = path + ".bmp"
    # Write the data out verbatim.
    with open(path, "wb") as fp:
        fp.write(data)
    return path


# main_parse walks a PDF and:
#   - extracts heading candidates into title_path (JSON)
#   - exports embedded images into image_dir
def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
    # Collected headings and image paths.
    texts = []
    images = []
    # extract_pages (pdfminer) yields the layout of each page in turn.
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        title_index = 0
        image_index = 0
        # Walk every element of the page layout.
        for element in page_layout:
            # Lines carry no text or image content: skip.
            if isinstance(element, LTLine):
                pass
            # Rectangles likewise: skip.
            elif isinstance(element, LTRect):
                pass
            # A single-line horizontal text box may be a heading. If it is,
            # store a record with the heading index (title_index), the page
            # number, the bounding box (element.bbox) and the text.
            elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                text = element.get_text().strip()
                # Headings are assumed to be one line with a larger font.
                if text and (is_title(text) or element.height > 15):
                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                    title_index += 1
            # A figure may wrap one or more embedded images: export each
            # LTImage found inside it.
            elif isinstance(element, LTFigure):
                for e_obj in element._objs:
                    if isinstance(e_obj, LTImage):
                        image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                        image_file = export_image(e_obj, image_file)
                        images.append(image_file)
                        pprint(f'Image saved: {image_file}')
                        image_index += 1
    # Persist the heading records as JSON.
    with open(title_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)


# table_parse extracts the tables between two headings of a PDF and stores
# them, merged across page breaks, as JSON.
def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单', end_title: str = '七、施工组织设计', table_path: str = 'table.json', start_page_number: int = None, end_page_number: int = None) -> list:
    """PDF table extraction.
    @pdf_path: path of the PDF to parse
    @title_path: path of the heading JSON produced by main_parse
    @start_title / @end_title: headings that delimit the page range to parse
    @table_path: output path for the extracted tables (JSON)
    @start_page_number / @end_page_number: explicit page range; when omitted,
        the pages are looked up in title_path via start_title and end_title
    """
    # Collected tables.
    tables = []
    # Resolve the page range from the heading JSON when it is not given.
    if (start_page_number is None) or (end_page_number is None):
        df = pd.read_json(title_path)
        # Use the last page on which each delimiting heading occurs.
        start_page_number = df[df['text'] == start_title].page_number.max()
        end_page_number = df[df['text'] == end_title].page_number.max()

    def concat_table(tables, table):
        """Add a table to the results: either append it as a new table, or
        splice it onto the previous one (a table continued across pages).
        @tables: accumulated table records
        @table: the table extracted from the current page
        """
        # Strip whitespace inside every cell; `first` is the cleaned first
        # row and `tail` the cleaned last row.
        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
        tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
        # For tables with more than one row, also clean the second row.
        if len(table) > 1:
            second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
        ### Decide how to add the table. The page number `i` is taken from
        ### the enclosing loop at call time:
        #   - the first row shares more than two cells with HEADERS: it is
        #     a header row, so start a new table with confidence 1;
        #   - the previous table ends on page i-1 and has the same column
        #     count: treat this table as its continuation and merge;
        #   - otherwise start a new table with confidence 0.
        if len(HEADERS & set(first)) > 2:
            # pprint("Many header cells found: independent header, new table.")
            tables.append({"page_numbers": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
        elif ((i - 1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
            # pprint("Continuation of the previous table: merge.")
            tables[-1]['page_numbers'].append(i)
            tables[-1]['table'].extend(table)
        else:
            tables.append({"page_numbers": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
        return tables

    # Walk the selected pages and feed every extracted table to concat_table.
    with pdfplumber.open(pdf_path) as pdf:
        for i in range(start_page_number, end_page_number):
            for table in pdf.pages[i].extract_tables():
                tables = concat_table(tables, table)
    # Persist the merged tables as JSON.
    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)
    return tables
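

# Minimal usage sketch for table_parse (file names here are placeholders):
#     tables = table_parse('bid.pdf', 'titles.json', table_path='tables.json')
#     print(f'{len(tables)} merged tables extracted')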


class PdfExtractAttr(object):
    def __init__(self, file_path: str):
        """PDF parser that collects text details, outline and tables.
        @file_path: path of the PDF to parse
        """
        super(PdfExtractAttr, self).__init__()
        self.file_path = file_path
        self.details = []
        self.tables = []

    # parse_outline parses the PDF outline (bookmarks), resolves every
    # entry to a page number, saves the result as JSON and returns it.
    def parse_outline(self, out_path: str):
        """Parse the PDF outline."""
        if os.path.exists(out_path):
            results = load_json(out_path)
        else:
            results = []
            with open(self.file_path, "rb") as fp:
                # parser reads the raw PDF stream.
                parser = PDFParser(fp)
                try:
                    # document is the parsed document object.
                    document = PDFDocument(parser)
                    # RefPageNumberResolver (from tools) resolves outline
                    # destinations to the page they point at.
                    ref_pagenum_resolver = RefPageNumberResolver(document)
                    # Fetch the outline entries.
                    outlines = document.get_outlines()
                    # Each entry is (level, title, dest, a, se); whichever
                    # of destination / action / structure element is set
                    # gets resolved to a page number, and the entry is
                    # appended to results as a dict.
                    for (level, title, dest, a, se) in outlines:
                        if dest:
                            page_num = ref_pagenum_resolver.resolve(dest)
                        elif a:
                            page_num = ref_pagenum_resolver.resolve(a)
                        elif se:
                            page_num = ref_pagenum_resolver.resolve(se)
                        else:
                            page_num = None
                        results.append({'level': level, 'title': title, 'page_number': page_num})
                except PDFNoOutlines:
                    # The PDF carries no outline at all.
                    print("No outlines found.")
                except PDFSyntaxError:
                    # Corrupted PDF, or not a PDF file.
                    print("Corrupted PDF or non-PDF file.")
                finally:
                    parser.close()
            # Save the results (e.g. as outlines.json).
            with open(out_path, 'w', encoding='utf-8') as op:
                json.dump(results, op, indent=4, ensure_ascii=False)
        print(results)
        return results

    # parse_text extracts every horizontal text box on every page, derives
    # its alignment from the page margins, and stores the details as JSON.
    def parse_text(self, out_path):
        """Parse the text boxes of every page."""
        if os.path.exists(out_path):
            self.details = load_json(out_path)
        else:
            # Walk the layout of each page.
            for page_number, page_layout in enumerate(extract_pages(self.file_path)):
                for element in page_layout:
                    # For every LTTextBoxHorizontal, measure its distance
                    # to each page edge and its width.
                    if isinstance(element, LTTextBoxHorizontal):
                        # Distance to the left edge
                        left = element.x0
                        # Distance to the right edge
                        right = (page_layout.width - element.x1)
                        # Distance to the top edge
                        top = (page_layout.height - element.y1)
                        # Distance to the bottom edge
                        bottom = element.y0
                        # Width of the text box
                        width = element.width
                        # Classify the alignment from the margins.
                        if (left > right) and (abs(left - right) > 100):
                            alignment = 'right'
                        elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
                            alignment = 'center'
                        else:
                            alignment = 'left'
                        # Record the parsed attributes of this text box.
                        self.details.append({
                            'page_number': page_number,
                            'index': element.index,
                            'x0': element.bbox[0],
                            'y0': element.bbox[1],
                            'x1': element.bbox[2],
                            'y1': element.bbox[3],
                            'alignment': alignment,
                            'lines': len(element._objs),
                            'text': element.get_text().strip(),
                            'is_table_name': element.get_text().strip().endswith('表')
                        })
            with open(out_path, 'w', encoding='utf-8') as fp:
                json.dump(self.details, fp, indent=4, ensure_ascii=False)
        # Keep a DataFrame view for parse_table's caption lookups.
        self.detail_df = pd.DataFrame(self.details)
        return self.details
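
    # Worked example for the alignment heuristic above (illustrative
    # numbers on an A4 page ~595 pt wide):
    #   a box with x0=400, x1=580 gives left=400, right=15:
    #       left > right and |left-right| = 385 > 100   -> 'right'
    #   a box with x0=250, x1=345 gives left=250, right=250, width=95:
    #       left > 100, |left-right| = 0 < 50, 0/95 < 0.5 -> 'center'
    #   everything else defaults to 'left'.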

    # Same idea as the module-level concat_table, but page-aware and able
    # to attach a caption (table_name) to each table.
    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
        """Add a table to the results: either append it as a new table, or
        splice it onto the previous one.
        @table: the table extracted from the current page
        @page_number: page the table was found on
        @table_name: caption to attach, if any
        @new: force the table to be added as a new table
        """
        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
        # The caller marked this table as new: append it unconditionally.
        if new:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
            return
        tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
        # For tables with more than one row, also clean the second row.
        if len(table) > 1:
            second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
        else:
            second = None
        # pprint(first)
        if len(HEADERS & set(first)) > 2:
            # The first row is a header row: start a new table.
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
        elif second and (len(HEADERS & set(second)) > 2):
            # The second row is the header row, so the first row is most
            # likely the caption: use it as table_name when none was given.
            if not table_name:
                first = [i for i in first if i]
                if len(first) == 1:
                    table_name = "".join(first)
            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
        elif len(self.tables) == 0:
            # Guard: a first page holding several tables used to leave
            # self.tables empty, making self.tables[-1] below raise
            # "list index out of range"; treat this as the starting table.
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
        elif ((page_number - 1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
            # Continuation of the previous table: merge.
            self.tables[-1]['page_numbers'].append(page_number)
            self.tables[-1]['table'].extend(table)
        else:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
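
    # Sketch of consuming a merged record (hypothetical `agent`; assumes
    # the first row is the header, which holds for confidence == 1 tables):
    #     rec = agent.tables[0]
    #     df = pd.DataFrame(rec['table'][1:], columns=rec['table'][0])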

    # parse_table drives the table extraction.
    ### NOTE: self.detail_df (built by parse_text) holds the details of
    ### every LTTextBoxHorizontal element and is used here to find table
    ### captions, so run parse_text first.
    def parse_table(self, out_path: str):
        """Parse the tables of every page."""
        with pdfplumber.open(self.file_path) as pdf:
            # Walk every page of the PDF.
            for page_number, page_layout in enumerate(pdf.pages):
                # Locate the tables on this page.
                tables = page_layout.find_tables()
                # Exactly one table: try to merge it with the previous one.
                if len(tables) == 1:
                    table = tables[0]
                    # Bounding box of the table.
                    x0, y0, x1, y1 = table.bbox
                    # Look for a centered caption ending in 表 on this page.
                    try:
                        table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
                    except Exception:
                        # self.detail_df is missing (parse_text not run) or
                        # the query failed: skip this page.
                        continue
                    # Without a caption, just merge the table content; with
                    # one, pass the caption along as table_name.
                    if table_title_df.empty:
                        print(f'processing page_number: {page_number}')
                        self.concat_table(table.extract(), page_number=page_number)
                    else:
                        table_title_name = table_title_df.iloc[0]['text']
                        print(f'processing page_number with table_name: {table_title_name}')
                        self.concat_table(table.extract(), page_number=page_number, table_name=table_title_name)
                # Several tables: only the first can continue a table from
                # the previous page; the rest are necessarily new tables.
                elif len(tables) > 1:
                    print(f'current page {page_number} has multiple tables')
                    # TODO: caption (table_name) matching for pages with
                    # multiple tables is not implemented yet.
                    # Merge-check the first table only.
                    first_table = tables[0]
                    self.concat_table(first_table.extract(), page_number=page_number)
                    # The remaining tables are appended with new=True.
                    for table_index in range(1, len(tables)):
                        self.concat_table(tables[table_index].extract(), page_number=page_number, new=True)
        with open(out_path, 'w', encoding='utf-8') as fp:
            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
        return self.tables


if __name__ == '__main__':
    pdf_path = './南方电网数字研究院有限公司.pdf'
    title_path = './南方电网数字研究院有限公司.json'
    image_dir = './test_images'
    os.makedirs(image_dir, exist_ok=True)
    main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
    # agent = PdfExtractAttr(file_path=pdf_path)
    # agent.parse_outline('./outlines.json')
    # agent.parse_text('./details.json')
    # agent.parse_table('./tables.json')
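    # The output paths in the commented calls above are placeholders, and
    # the order matters: parse_text must run before parse_table, because
    # parse_table reads self.detail_df (built by parse_text) to look up
    # table captions.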