# get_info.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2024-06-11 13:43:14
# @Last Modified by: privacy
# @Last Modified time: 2024-06-11 14:10:56
#
# Alternative image-extraction approach kept for reference (PyPDF2 + PIL):
# import os
# from PIL import Image
# from PyPDF2 import PdfReader
# # Read the PDF file
# with open(pdf_path, 'rb') as file:
#     reader = PdfReader(file)
#     num_pages = len(reader.pages)
#     # Iterate over every page of the PDF
#     for page_num in range(num_pages):
#         page = reader.pages[page_num]
#         # Extract the images on this page
#         if '/XObject' in page['/Resources']:
#             xobjects = page['/Resources']['/XObject'].get_object()
#             for obj in xobjects:
#                 if xobjects[obj]['/Subtype'] == '/Image':
#                     size = (xobjects[obj]['/Width'], xobjects[obj]['/Height'])
#                     data = xobjects[obj].get_data()
#                     if xobjects[obj]['/ColorSpace'] == '/DeviceRGB':
#                         mode = "RGB"
#                     else:
#                         mode = "P"
#                     img = Image.frombytes(mode, size, data)
#                     img_path = os.path.join(output_dir, f'image_{page_num}_{obj}.png')
#                     img.save(img_path)
#                     print(f'Image saved: {img_path}')
#######################################################################
# Alternative image-extraction approach kept for reference (PyMuPDF / fitz):
# import os
# import re
# import fitz
# def pdf2pic(path, save_path):
#     checkXO = r"/Type(?= */XObject)"
#     checkIM = r"/Subtype(?= */Image)"
#     pdf = fitz.open(path)
#     lenXREF = pdf._getXrefLength()
#     imgcount = 0
#     for i in range(1, lenXREF):
#         text = pdf._getXrefString(i)
#         isXObject = re.search(checkXO, text)
#         isImage = re.search(checkIM, text)
#         if not isXObject or not isImage:
#             continue
#         imgcount += 1
#         pix = fitz.Pixmap(pdf, i)
#         new_name = f"img_{imgcount}.png"
#         if pix.n < 5:
#             pix.writePNG(os.path.join(pic_path, new_name))
#         else:
#             pix0 = fitz.Pixmap(fitz.csRGB, pix)
#             pix0.writePNG(os.path.join(pic_path, new_name))
#             pix0 = None
#         pix = None
# if __name__ == '__main__':
#     pdf2pic(pdf_path, image_dir)
#######################################################################
  60. # 标准包导入
  61. import os
  62. import re
  63. import json
  64. from io import BytesIO
  65. from pprint import pprint
  66. # 第三方包导入
  67. import numpy as np
  68. import pandas as pd
  69. import cv2
  70. from pdfminer.high_level import extract_pages
  71. from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
  72. from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
  73. from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
  74. from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
  75. from pdfminer.pdftypes import (
  76. LITERALS_DCT_DECODE,
  77. LITERALS_JBIG2_DECODE,
  78. LITERALS_JPX_DECODE,
  79. LITERALS_FLATE_DECODE,
  80. )
  81. from pdfminer.pdfparser import PDFParser, PDFSyntaxError
  82. from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
  83. import pdfplumber
  84. # 自定义包导入
  85. from tools import RefPageNumberResolver
  86. HEADERS = set({'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'})
  87. def is_title(line: str) -> bool:
  88. title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
  89. if title_word:
  90. return True
  91. title_word = re.findall('^附录|^参考文献|^附表', line.strip())
  92. if title_word:
  93. return True
  94. return False
  95. def export_image(image: LTImage, path: str) -> str:
  96. """Save an LTImage to disk"""
  97. (width, height) = image.srcsize
  98. filters = image.stream.get_filters()
  99. if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
  100. name = _save_jpeg(image, path)
  101. elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
  102. name = _save_jpeg2000(image, path)
  103. elif image.bits == 1:
  104. name = _save_bmp(image, width, height, (width + 7) // 8, image.bits)
  105. elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
  106. name = _save_bmp(image, width, height, width * 3, image.bits * 3)
  107. elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
  108. name = _save_bmp(image, width, height, width, image.bits)
  109. elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
  110. name = _save_bytes(image)
  111. else:
  112. name = _save_raw(image)
  113. return name
  114. def _save_jpeg(image: LTImage, path: str) -> str:
  115. """Save a JPEG encoded image"""
  116. raw_data = image.stream.get_rawdata()
  117. assert raw_data is not None
  118. path = path + ".jpg"
  119. with open(path, "wb") as fp:
  120. if LITERAL_DEVICE_CMYK in image.colorspace:
  121. try:
  122. from PIL import Image, ImageChops # type: ignore[import]
  123. except ImportError:
  124. raise ImportError(PIL_ERROR_MESSAGE)
  125. ifp = BytesIO(raw_data)
  126. i = Image.open(ifp)
  127. i = ImageChops.invert(i)
  128. i = i.convert("RGB")
  129. i.save(fp, "JPEG")
  130. else:
  131. fp.write(raw_data)
  132. return path
  133. def _save_jpeg2000(image: LTImage, path: str) -> str:
  134. """Save a JPEG 2000 encoded image"""
  135. raw_data = image.stream.get_rawdata()
  136. assert raw_data is not None
  137. path = path + ".png"
  138. try:
  139. from PIL import Image # type: ignore[import]
  140. except ImportError:
  141. raise ImportError(PIL_ERROR_MESSAGE)
  142. # 如果我们只写原始数据,我尝试过的大多数图像程序都无法打开文件。
  143. # 然而,使用OpenCV2打开和保存会生成一个文件,该文件似乎很容易被其他程序打开
  144. ifp = BytesIO(raw_data)
  145. i = Image.open(ifp)
  146. opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
  147. cv2.imwrite(path, opencv_image)
  148. return path
  149. def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
  150. texts = []
  151. images = []
  152. # 读取PDF文件并提取页面
  153. for page_number, page_layout in enumerate(extract_pages(pdf_path)):
  154. title_index = 0
  155. image_index = 0
  156. for element in page_layout:
  157. if isinstance(element, LTLine):
  158. pass
  159. elif isinstance(element, LTRect):
  160. pass
  161. elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
  162. text = element.get_text().strip()
  163. # # 假设标题通常是一行且字体较大
  164. if text and (is_title(text) or element.height > 15):
  165. texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
  166. title_index += 1
  167. elif isinstance(element, LTFigure):
  168. for e_obj in element._objs:
  169. if isinstance(e_obj, LTImage):
  170. # 提取图片数据
  171. image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
  172. image_file = export_image(e_obj, image_file)
  173. images.append(image_file)
  174. pprint(f'Image saved: {image_file}')
  175. image_index += 1
  176. with open(title_path, 'w', encoding='utf-8') as fp:
  177. json.dump(texts, fp, indent=4, ensure_ascii=False)
  178. def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单', end_title: str = '七、施工组织设计', table_path: str = 'table.json', start_page_number: int = None, end_page_number: int = None) -> list:
  179. """pdf表格解析功能
  180. @pdf_path
  181. @title_path
  182. @start_title
  183. @end_title
  184. @table_path
  185. @start_page_number
  186. @end_page_number
  187. """
  188. tables = []
  189. if (start_page_number == None) or (end_page_number == None):
  190. df = pd.read_json(title_path)
  191. start_page_number = df[df['text'] == start_title].page_number.max()
  192. end_page_number = df[df['text'] == end_title].page_number.max()
  193. def concat_table(tables, table):
  194. """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
  195. @tables
  196. @table
  197. """
  198. first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
  199. tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
  200. if len(table) > 1:
  201. second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
  202. # pprint(first)
  203. if len(HEADERS & set(first)) > 2:
  204. # pprint("找到大量表头元素,判断为独立表头,生成新表!")
  205. tables.append({"page_numbers": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
  206. elif ((i-1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
  207. # pprint("有空列,不是单独表,直接合并")
  208. tables[-1]['page_numbers'].append(i)
  209. tables[-1]['table'].extend(table)
  210. else:
  211. tables.append({"page_numbers": [i], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
  212. return tables
  213. with pdfplumber.open(pdf_path) as pdf:
  214. for i in range(start_page_number, end_page_number):
  215. for table in pdf.pages[i].extract_tables():
  216. tables = concat_table(tables, table)
  217. with open(table_path, 'w', encoding='utf-8') as fp:
  218. json.dump(tables, fp, indent=4, ensure_ascii=False)
  219. return tables
  220. class PdfExtractAttr(object):
  221. def __init__(self, file_path: str):
  222. """PDF文件解析
  223. @file_path
  224. """
  225. super(PdfExtractAttr, self).__init__()
  226. self.file_path = file_path
  227. self.details = []
  228. self.tables = []
  229. def parse_outline(self):
  230. """PDF大纲解析
  231. """
  232. results = []
  233. with open(self.file_path, "rb") as fp:
  234. try:
  235. parser = PDFParser(fp)
  236. document = PDFDocument(parser)
  237. ref_pagenum_resolver = RefPageNumberResolver(document)
  238. outlines = document.get_outlines()
  239. for (level, title, dest, a, se) in outlines:
  240. if dest:
  241. page_num = ref_pagenum_resolver.resolve(dest)
  242. elif a:
  243. page_num = ref_pagenum_resolver.resolve(a)
  244. elif se:
  245. page_num = ref_pagenum_resolver.resolve(se)
  246. else:
  247. page_num = None
  248. results.append({'level': level, 'title': title, 'page_number': page_num})
  249. except PDFNoOutlines:
  250. print("No outlines found.")
  251. except PDFSyntaxError:
  252. print("Corrupted PDF or non-PDF file.")
  253. finally:
  254. parser.close()
  255. with open('outlines.json', 'w', encoding='utf-8') as op:
  256. json.dump(results, op, indent=4, ensure_ascii=False)
  257. print(results)
  258. def parse_text(self) -> None:
  259. """文本解析
  260. """
  261. for page_number, page_layout in enumerate(extract_pages(self.file_path)):
  262. for element in page_layout:
  263. if isinstance(element, LTTextBoxHorizontal):
  264. # 距离左侧
  265. left = element.x0
  266. # 距离右侧
  267. right = (page_layout.width - element.x1)
  268. # 距离上侧
  269. top = (page_layout.height - element.y1)
  270. # 距离下侧
  271. button = element.y0
  272. # 文本宽度
  273. width = element.width
  274. if (left > right) and (abs(left - right) > 100):
  275. alignment = 'right'
  276. elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
  277. alignment = 'center'
  278. else:
  279. alignment = 'left'
  280. self.details.append({
  281. 'page_number': page_number,
  282. 'index': element.index,
  283. 'x0': element.bbox[0],
  284. 'y0': element.bbox[1],
  285. 'x1': element.bbox[2],
  286. 'y1': element.bbox[3],
  287. 'alignment': alignment,
  288. 'lines': len(element._objs),
  289. 'text': element.get_text().strip(),
  290. 'is_table_name': element.get_text().strip().endswith('表')
  291. })
  292. self.detail_df = pd.DataFrame(self.details)
  293. def concat_table(self, table: list, page_number: int, table_name: str = None) -> None:
  294. """尝试将表添加到结果列中,有两种情况,直接添加一个新表;拼接最后一个表
  295. @table
  296. """
  297. first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
  298. tail = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[-1]]
  299. if len(table) > 1:
  300. second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
  301. else:
  302. second = None
  303. # pprint(first)
  304. if len(HEADERS & set(first)) > 2:
  305. # pprint("找到大量表头元素,判断为独立表头,生成新表!")
  306. self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
  307. elif second and (len(HEADERS & set(second)) > 2):
  308. # pprint("找到大量表头元素,判断为独立表头,生成新表!")
  309. if not table_name:
  310. first = [i for i in first if i]
  311. if len(first) == 1:
  312. table_name = "".join(first)
  313. self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
  314. elif ((page_number-1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
  315. # pprint("有空列,不是单独表,直接合并")
  316. self.tables[-1]['page_numbers'].append(page_number)
  317. self.tables[-1]['table'].extend(table)
  318. else:
  319. self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
  320. def parse_table(self) -> None:
  321. """表格解析
  322. """
  323. with pdfplumber.open(self.file_path) as pdf:
  324. for page_number, page_layout in enumerate(pdf.pages):
  325. # 查询是否存在表格
  326. tables = page_layout.find_tables()
  327. # 检测到该页面存在一个表格,对其进行合并判断
  328. if len(tables) == 1:
  329. table = tables[0]
  330. x0, y0, x1, y1 = table.bbox
  331. table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
  332. if table_title_df.empty:
  333. self.concat_table(table.extract(), page_number=page_number)
  334. else:
  335. table_title_name = table_title_df.iloc[0]['text']
  336. self.concat_table(table.extract(), page_number=page_number, table_name=table_title_name)
  337. table = tables[0]
  338. #self.concat_table(table.extract(), table_title_name)
  339. # 检测到存在多个表格,对第一个表格进行合并判断之后的表格一定不相干
  340. elif len(tables) > 1:
  341. pass
  342. def output(self, table_path: str = 'all_tables.json'):
  343. """结果输出
  344. """
  345. with open(table_path, 'w', encoding='utf-8') as fp:
  346. json.dump(self.tables, fp, indent=4, ensure_ascii=False)
  347. return self.tables
  348. if __name__ == '__main__':
  349. pdf_path = './投标文件-修改版9-5-1-1.pdf'
  350. title_path = './投标文件-修改版9-5-1-1.json'
  351. image_dir = './extracted_images'
  352. os.makedirs(image_dir, exist_ok=True)
  353. main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
  354. # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_title='六、已标价工程量清单', end_title = '七、施工组织设计')
  355. # tables = table_parse(pdf_path=pdf_path, title_path=title_path, start_page_number=0, end_page_number=725)
  356. agent = PdfExtractAttr(file_path=pdf_path)
  357. agent.parse_outline()
  358. agent.parse_text()
  359. agent.parse_table()
  360. agent.output()