get_info.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-06-11 14:10:56
  6. # import os
  7. # from PIL import Image
  8. # from PyPDF2 import PdfReader
  9. # # 读取PDF文件
  10. # with open(pdf_path, 'rb') as file:
  11. # reader = PdfReader(file)
  12. # num_pages = len(reader.pages)
  13. # # 遍历PDF的每一页
  14. # for page_num in range(num_pages):
  15. # page = reader.pages[page_num]
  16. # # 提取页面中的图像
  17. # if '/XObject' in page['/Resources']:
  18. # xobjects = page['/Resources']['/XObject'].get_object()
  19. # for obj in xobjects:
  20. # if xobjects[obj]['/Subtype'] == '/Image':
  21. # size = (xobjects[obj]['/Width'], xobjects[obj]['/Height'])
  22. # data = xobjects[obj].get_data()
  23. # if xobjects[obj]['/ColorSpace'] == '/DeviceRGB':
  24. # mode = "RGB"
  25. # else:
  26. # mode = "P"
  27. # img = Image.frombytes(mode, size, data)
  28. # img_path = os.path.join(output_dir, f'image_{page_num}_{obj}.png')
  29. # img.save(img_path)
  30. # print(f'Image saved: {img_path}')
  31. #######################################################################
  32. # import os
  33. # import re
  34. # import fitz
  35. # def pdf2pic(path, save_path):
  36. # checkXO = r"/Type(?= */XObject)"
  37. # checkIM = r"/Subtype(?= */Image)"
  38. # pdf = fitz.open(path)
  39. # lenXREF = pdf._getXrefLength()
  40. # imgcount = 0
  41. # for i in range(1, lenXREF):
  42. # text = pdf._getXrefString(i)
  43. # isXObject = re.search(checkXO, text)
  44. # isImage = re.search(checkIM, text)
  45. # if not isXObject or not isImage:
  46. # continue
  47. # imgcount += 1
  48. # pix = fitz.Pixmap(pdf, i)
  49. # new_name = f"img_{imgcount}.png"
  50. # if pix.n < 5:
  51. # pix.writePNG(os.path.join(pic_path, new_name))
  52. # else:
  53. # pix0 = fitz.Pixmap(fitz.csRGB, pix)
  54. # pix0.writePNG(os.path.join(pic_path, new_name))
  55. # pix0 = None
  56. # pix = None
  57. # if __name__ == '__main__':
  58. # pdf2pic(pdf_path, image_dir)
  59. #######################################################################
  60. import os
  61. import re
  62. import json
  63. from io import BytesIO
  64. from pprint import pprint
  65. import numpy as np
  66. import cv2
  67. from pdfminer.high_level import extract_pages
  68. from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
  69. from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
  70. from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
  71. from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
  72. from pdfminer.pdftypes import (
  73. LITERALS_DCT_DECODE,
  74. LITERALS_JBIG2_DECODE,
  75. LITERALS_JPX_DECODE,
  76. LITERALS_FLATE_DECODE,
  77. )
  78. def is_title(line: str) -> bool:
  79. title_word = re.findall('^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
  80. if title_word:
  81. return True
  82. title_word = re.findall('^附录|^参考文献|^附表', line.strip())
  83. if title_word:
  84. return True
  85. return False
  86. def export_image(image: LTImage, path: str) -> str:
  87. """Save an LTImage to disk"""
  88. (width, height) = image.srcsize
  89. filters = image.stream.get_filters()
  90. if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
  91. name = _save_jpeg(image, path)
  92. elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
  93. name = _save_jpeg2000(image, path)
  94. elif image.bits == 1:
  95. name = _save_bmp(image, width, height, (width + 7) // 8, image.bits)
  96. elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
  97. name = _save_bmp(image, width, height, width * 3, image.bits * 3)
  98. elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
  99. name = _save_bmp(image, width, height, width, image.bits)
  100. elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
  101. name = _save_bytes(image)
  102. else:
  103. name = _save_raw(image)
  104. return name
  105. def _save_jpeg(image: LTImage, path: str) -> str:
  106. """Save a JPEG encoded image"""
  107. raw_data = image.stream.get_rawdata()
  108. assert raw_data is not None
  109. path = path + ".jpg"
  110. with open(path, "wb") as fp:
  111. if LITERAL_DEVICE_CMYK in image.colorspace:
  112. try:
  113. from PIL import Image, ImageChops # type: ignore[import]
  114. except ImportError:
  115. raise ImportError(PIL_ERROR_MESSAGE)
  116. ifp = BytesIO(raw_data)
  117. i = Image.open(ifp)
  118. i = ImageChops.invert(i)
  119. i = i.convert("RGB")
  120. i.save(fp, "JPEG")
  121. else:
  122. fp.write(raw_data)
  123. return path
  124. def _save_jpeg2000(image: LTImage, path: str) -> str:
  125. """Save a JPEG 2000 encoded image"""
  126. raw_data = image.stream.get_rawdata()
  127. assert raw_data is not None
  128. path = path + ".png"
  129. try:
  130. from PIL import Image # type: ignore[import]
  131. except ImportError:
  132. raise ImportError(PIL_ERROR_MESSAGE)
  133. # if we just write the raw data, most image programs
  134. # that I have tried cannot open the file. However,
  135. # open and saving with PIL produces a file that
  136. # seems to be easily opened by other programs
  137. ifp = BytesIO(raw_data)
  138. i = Image.open(ifp)
  139. opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
  140. cv2.imwrite(path, opencv_image)
  141. return path
  142. def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
  143. texts = []
  144. images = []
  145. # 读取PDF文件并提取页面
  146. for page_number, page_layout in enumerate(extract_pages(pdf_path)):
  147. title_index = 0
  148. image_index = 0
  149. for element in page_layout:
  150. if isinstance(element, LTLine):
  151. pass
  152. elif isinstance(element, LTRect):
  153. pass
  154. elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
  155. text = element.get_text().strip()
  156. # # 假设标题通常是一行且字体较大
  157. if text and (is_title(text) or element.height > 15):
  158. texts.append({'index': title_index, 'pageno': page_number, 'bbox': element.bbox, 'text': text})
  159. title_index += 1
  160. elif isinstance(element, LTFigure):
  161. for e_obj in element._objs:
  162. if isinstance(e_obj, LTImage):
  163. # 提取图片数据
  164. image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
  165. image_file = export_image(e_obj, image_file)
  166. images.append(image_file)
  167. print(f'Image saved: {image_file}')
  168. image_index += 1
  169. with open(title_path, 'a', encoding='utf-8') as fp:
  170. json.dump(texts, fp, indent=4, ensure_ascii=False)
  171. if __name__ == '__main__':
  172. pdf_path = './投标文件-修改版9-5-1-1.pdf'
  173. title_path = './投标文件-修改版9-5-1-1.json'
  174. image_dir = './extracted_images'
  175. os.makedirs(image_dir, exist_ok=True)
  176. main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)