# advanced_info.py

# Standard library imports
import os
import re
import json
from io import BytesIO
from typing import Optional

# Third-party imports
import numpy as np
import pandas as pd
import cv2
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTImage
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JPX_DECODE,
)
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines

# Local imports
from tools import RefPageNumberResolver
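
# Note: RefPageNumberResolver comes from the local tools module. As used in
# parse_outline below, it is assumed to expose RefPageNumberResolver(document)
# and .resolve(ref), returning the page number for an outline destination,
# action, or structure element.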

PIL_ERROR_MESSAGE = "PIL import error"

# Known header-row cell values; used to decide whether an extracted row is a
# table header row (see concat_table below).
HEADERS = {
    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价',
    '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌',
    '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)',
    '含税合价(元)', '条款号', '评分因素', '评分标准', '页码',
}
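
# Example of the header heuristic used by concat_table below: a row counts as
# a header row when it shares more than two cells with HEADERS, e.g.
#   len(HEADERS & {'序号', '项目名称', '单位', '工程量'}) == 4  -> header row
#   len(HEADERS & {'备注', '其他', '小计'}) == 1               -> data row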


def is_title(line: str) -> bool:
    """Heuristically decide whether a line of text looks like a section title."""
    line = line.strip()
    # Chinese-style numbering: "(一)", "1."–"29.", "第三章/节/条", "一、", etc.
    if re.findall(r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^第[一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line):
        return True
    # Appendix, references, and attached-table headings.
    if re.findall(r'^附录|^参考文献|^附表', line):
        return True
    return False
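
# Illustrative checks (not executed by the module):
#   is_title('第三章 评标办法')  -> True   (chapter numbering)
#   is_title('(一)投标人须知')  -> True   (parenthesised numbering)
#   is_title('普通正文内容')     -> False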


def export_image(image: LTImage, path: str) -> Optional[str]:
    """Save an LTImage to disk and return the path of the written file.

    The file extension is chosen by sniffing well-known magic numbers; None is
    returned when the image stream carries no data at all.
    """
    filters = image.stream.get_filters()
    # A single DCT (JPEG) or JPX (JPEG 2000) filter can be written out directly.
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        return _save_jpeg(image, path)
    if len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        return _save_jpeg2000(image, path)
    # Prefer the decoded stream; fall back to the raw (still encoded) bytes.
    data = image.stream.get_data() or image.stream.get_rawdata()
    if not data:
        return None
    if data[:8] == b'\xffO\xffQ\x00/\x00\x00':
        # Bare JPEG 2000 codestream: re-encode through Pillow.
        return _save_j2k(image, path)
    if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':
        path += '.jpg'   # JPEG: SOI ... EOI markers
    elif data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
        path += '.png'   # PNG signature
    elif data[:2] == b'\x42\x4d':
        path += '.bmp'   # BMP: "BM"
    elif data[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):
        path += '.gif'   # GIF87a / GIF89a
    elif data[:2] in (b'\x4d\x4d', b'\x49\x49'):
        path += '.tiff'  # TIFF byte-order marks "MM" / "II"
    else:
        path += '.unk'   # Unknown format: dump the bytes as-is
    with open(path, 'wb') as file:
        file.write(data)
    return path
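
# Usage sketch (hypothetical path; main_parse below calls it the same way):
#   saved = export_image(lt_image, './extracted_images/image_page_0_0')
#   # -> e.g. './extracted_images/image_page_0_0.png', extension sniffed above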


def _save_j2k(image: LTImage, path: str) -> str:
    """Save a raw JPEG 2000 codestream by re-encoding it as PNG via Pillow."""
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    path += '.png'
    data = image.stream.get_data()
    assert data is not None
    roi_img = Image.open(BytesIO(data))
    roi_img.save(path)
    return path


def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG encoded image."""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path += '.jpg'
    with open(path, 'wb') as fp:
        if LITERAL_DEVICE_CMYK in image.colorspace:
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)
            # Adobe-style CMYK JPEGs store inverted values, so invert before
            # converting to RGB.
            ifp = BytesIO(raw_data)
            i = Image.open(ifp)
            i = ImageChops.invert(i)
            i = i.convert('RGB')
            i.save(fp, 'JPEG')
        else:
            fp.write(raw_data)
    return path


def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 encoded image."""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path += '.png'
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    # If we simply write out the raw data, most image viewers cannot open the
    # file. Opening it and re-saving it through OpenCV, however, produces a
    # file that other programs open without trouble.
    ifp = BytesIO(raw_data)
    i = Image.open(ifp)
    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
    cv2.imwrite(path, opencv_image)
    return path


def _save_bmp(image: LTImage, width: int, height: int, bytes_per_line: int, bits: int, path: str) -> str:
    """Save a BMP encoded image.

    Note: currently unused in this module; the size/stride parameters are not
    consumed and the decoded bytes are written verbatim.
    """
    data = image.stream.get_data()
    path += '.bmp'
    with open(path, 'wb') as fp:
        fp.write(data)
    return path


def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
    """Walk every page of the PDF, collecting candidate titles and exporting images."""
    texts = []
    images = []
    # Read the PDF and iterate over its page layouts.
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        title_index = 0
        image_index = 0
        for element in page_layout:
            if isinstance(element, LTLine):
                pass
            elif isinstance(element, LTRect):
                pass
            elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                text = element.get_text().strip()
                # Assume a title is a single line with a comparatively large font.
                if text and (is_title(text) or element.height > 15):
                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                    title_index += 1
            elif isinstance(element, LTFigure):
                for e_obj in element._objs:
                    if isinstance(e_obj, LTImage):
                        # Export the embedded image data.
                        image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                        image_file = export_image(e_obj, image_file)
                        images.append(image_file)
                        print(f'Image saved: {image_file}')
                        image_index += 1
    with open(title_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)


def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单',
                end_title: str = '七、施工组织设计', table_path: str = 'table.json',
                start_page_number: Optional[int] = None, end_page_number: Optional[int] = None) -> list:
    """Parse the tables found in a page range of the PDF.

    @pdf_path: source PDF file
    @title_path: title JSON produced by main_parse
    @start_title: title whose page starts the range (used when no explicit page is given)
    @end_title: title whose page ends the range (exclusive)
    @table_path: output JSON file for the parsed tables
    @start_page_number: explicit first page, overrides start_title
    @end_page_number: explicit end page, overrides end_title
    """
    tables = []
    if (start_page_number is None) or (end_page_number is None):
        df = pd.read_json(title_path)
        start_page_number = df[df['text'] == start_title].page_number.max()
        end_page_number = df[df['text'] == end_title].page_number.max()

    def concat_table(tables: list, table: list, page_number: int) -> list:
        """Either append `table` as a new entry or merge it into the last table.

        @tables: accumulated results
        @table: the table extracted from the current page
        @page_number: page the table was found on
        """
        first = [''.join(cell.split()) if cell else cell for cell in table[0]]
        if len(HEADERS & set(first)) > 2:
            # The first row contains many known header cells: start a new table.
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
        elif tables and ((page_number - 1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
            # Same column count as the table on the previous page: treat it as a
            # continuation and merge.
            tables[-1]['page_numbers'].append(page_number)
            tables[-1]['table'].extend(table)
        else:
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
        return tables

    with pdfplumber.open(pdf_path) as pdf:
        for i in range(start_page_number, end_page_number):
            for table in pdf.pages[i].extract_tables():
                tables = concat_table(tables, table, i)
    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)
    return tables
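
# Usage sketch (hypothetical pages; by default the range is located through the
# start/end titles recorded in title_path):
#   tables = table_parse('bid.pdf', 'titles.json',
#                        start_page_number=10, end_page_number=20)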


class PdfExtractAttr(object):

    def __init__(self, file_path: str):
        """PDF file parser.

        @file_path: path to the PDF file
        """
        super(PdfExtractAttr, self).__init__()
        self.file_path = file_path
        self.details = []
        self.tables = []
        self.content = []

    def parse_outline(self):
        """Parse the PDF outline (bookmarks) and dump it to outlines.json."""
        results = []
        with open(self.file_path, 'rb') as fp:
            parser = None
            try:
                parser = PDFParser(fp)
                document = PDFDocument(parser)
                ref_pagenum_resolver = RefPageNumberResolver(document)
                outlines = document.get_outlines()
                for (level, title, dest, a, se) in outlines:
                    # Resolve the page number from whichever reference is
                    # present: a destination, an action, or a structure element.
                    if dest:
                        page_num = ref_pagenum_resolver.resolve(dest)
                    elif a:
                        page_num = ref_pagenum_resolver.resolve(a)
                    elif se:
                        page_num = ref_pagenum_resolver.resolve(se)
                    else:
                        page_num = None
                    results.append({'level': level, 'title': title, 'page_number': page_num})
            except PDFNoOutlines:
                print("No outlines found.")
            except PDFSyntaxError:
                print("Corrupted PDF or non-PDF file.")
            finally:
                if parser is not None:  # PDFParser() itself may have raised
                    parser.close()
        with open('outlines.json', 'w', encoding='utf-8') as op:
            json.dump(results, op, indent=4, ensure_ascii=False)
        print(results)

    def extract_content(self) -> list:
        """Extract the plain text of every page (page numbers are 0-based)."""
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                self.content.append({
                    'page_number': page.page_number - 1,  # pdfplumber is 1-based
                    'text': page.extract_text()
                })
        return self.content

    def parse_text(self) -> None:
        """Parse text boxes and classify their horizontal alignment."""
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    # Margins of the box relative to the page edges.
                    left = element.x0
                    right = page_layout.width - element.x1
                    top = page_layout.height - element.y1
                    bottom = element.y0
                    # Width of the text box itself.
                    width = element.width
                    if (left > right) and (abs(left - right) > 100):
                        alignment = 'right'
                    elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
                        alignment = 'center'
                    else:
                        alignment = 'left'
                    self.details.append({
                        'page_number': page_number,
                        'index': element.index,
                        'x0': element.bbox[0],
                        'y0': element.bbox[1],
                        'x1': element.bbox[2],
                        'y1': element.bbox[3],
                        'alignment': alignment,
                        'lines': len(element._objs),
                        'text': element.get_text().strip(),
                        'is_table_name': element.get_text().strip().endswith('表')
                    })
        self.detail_df = pd.DataFrame(self.details)
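
    # Worked example of the alignment heuristic: a box with left margin 120 and
    # right margin 110 is not right-aligned (|120 - 110| <= 100), but it is
    # centred, since left > 100, |120 - 110| < 50, and 10 / width < 0.5.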

    def concat_table(self, table: list, page_number: int, table_name: str = None) -> None:
        """Either append `table` as a new entry or merge it into the last table.

        @table: the table extracted from the current page
        @page_number: page the table was found on
        @table_name: optional title of the table
        """
        first = [''.join(cell.split()) if cell else cell for cell in table[0]]
        if len(table) > 1:
            second = [''.join(cell.split()) if cell else cell for cell in table[1]]
        else:
            second = None
        if len(HEADERS & set(first)) > 2:
            # The first row contains many known header cells: start a new table.
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
        elif second and (len(HEADERS & set(second)) > 2):
            # The second row is the header; the first row is likely the table name.
            if not table_name:
                first = [i for i in first if i]
                if len(first) == 1:
                    table_name = "".join(first)
            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
        elif self.tables and ((page_number - 1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
            # Same column count as the table on the previous page: merge as a
            # continuation.
            self.tables[-1]['page_numbers'].append(page_number)
            self.tables[-1]['table'].extend(table)
        else:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
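
    # Illustrative merge decision: if the previous table ended on page 4 with
    # col_len == 9, and page 5 yields a 9-column table whose first row shares
    # at most two cells with HEADERS, its rows are appended to that table.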

    def parse_table(self) -> None:
        """Parse tables; requires parse_text() to have run (uses self.detail_df)."""
        with pdfplumber.open(self.file_path) as pdf:
            for page_number, page_layout in enumerate(pdf.pages):
                # Look for tables on this page.
                tables = page_layout.find_tables()
                if len(tables) == 1:
                    # Exactly one table: check whether it continues an earlier
                    # one, and attach a centred '…表' line above it as its name.
                    table = tables[0]
                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
                    if table_title_df.empty:
                        self.concat_table(table.extract(), page_number=page_number)
                    else:
                        table_title_name = table_title_df.iloc[0]['text']
                        self.concat_table(table.extract(), page_number=page_number, table_name=table_title_name)
                elif len(tables) > 1:
                    # Several tables on one page: only the first could continue
                    # a previous table; the rest are independent. Not handled yet.
                    pass

    def output(self, table_path: str = 'all_tables.json'):
        """Write all collected tables to table_path and return them."""
        with open(table_path, 'w', encoding='utf-8') as fp:
            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
        return self.tables


if __name__ == '__main__':
    pdf_path = './南方电网数字研究院有限公司.pdf'
    title_path = './南方电网数字研究院有限公司.json'
    image_dir = './extracted_images'
    os.makedirs(image_dir, exist_ok=True)
    main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
    agent = PdfExtractAttr(file_path=pdf_path)
    print(agent.extract_content())
    agent.parse_outline()
    agent.parse_text()
    agent.parse_table()
    agent.output()