# pdf_lineation.py
  1. import sys
  2. import cv2
  3. from pdfminer.pdfparser import PDFParser
  4. from pdfminer.pdfdocument import PDFDocument
  5. from pdfminer.pdfpage import PDFPage
  6. from pdfminer.pdfpage import PDFTextExtractionNotAllowed
  7. from pdfminer.pdfinterp import PDFResourceManager
  8. from pdfminer.pdfinterp import PDFPageInterpreter
  9. from pdfminer.pdfdevice import PDFDevice
  10. from pdfminer.layout import LAParams
  11. from pdfminer.converter import PDFPageAggregator
  12. import pdfminer
  13. import numpy as np
  14. import matplotlib.pyplot as plt
  15. from pdf2image import convert_from_path
  16. image_path = sys.argv[1]
  17. layout_type = ['LTTextBox', 'LTFigure', 'LTImage', 'LTCurve', 'LTRect']
  18. # Text:红色, Figure:绿色, Image:蓝色, Curve:黄色, Rect:紫色
  19. color = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (160, 32, 240)]
  20. draw_color = dict(zip(layout_type, color))
  21. def parse_obj(lt_objs):
  22. boxs = {x: [] for x in layout_type}
  23. # loop over the object list
  24. for obj in lt_objs:
  25. if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
  26. boxs['LTTextBox'].append(obj.bbox)
  27. elif isinstance(obj, pdfminer.layout.LTFigure):
  28. boxs['LTFigure'].append(obj.bbox)
  29. elif isinstance(obj, pdfminer.layout.LTImage):
  30. boxs['LTImage'].append(obj.bbox)
  31. elif isinstance(obj, pdfminer.layout.LTCurve):
  32. boxs['LTCurve'].append(obj.bbox)
  33. elif isinstance(obj, pdfminer.layout.LTRect):
  34. boxs['LTRect'].append(obj.bbox)
  35. else:
  36. raise
  37. return boxs
  38. # Open a PDF file.
  39. fp = open(image_path, 'rb')
  40. # Create a PDF parser object associated with the file object.
  41. parser = PDFParser(fp)
  42. # Create a PDF document object that stores the document structure.
  43. # Supply the password for initialization.
  44. document = PDFDocument(parser)
  45. # Check if the document allows text extraction. If not, abort.
  46. # if not document.is_extractable:
  47. # raise PDFTextExtractionNotAllowed
  48. # Create a PDF resource manager object that stores shared resources.
  49. rsrcmgr = PDFResourceManager()
  50. # Create a PDF page aggregator object.
  51. device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
  52. interpreter = PDFPageInterpreter(rsrcmgr, device)
  53. page_boxs = []
  54. for page in PDFPage.create_pages(document):
  55. interpreter.process_page(page)
  56. # receive the LTPage object for the page.
  57. layout = device.get_result()
  58. # extract text from this object
  59. boxs = parse_obj(layout._objs)
  60. page_sized = tuple([round(i) for i in layout.bbox])
  61. page_boxs.append((page_sized, boxs))
  62. pass
  63. image = convert_from_path(image_path)
  64. assert len(image) == len(page_boxs), "The number of boxes doesn't match the number of pictures"
  65. for i in range(len(image)):
  66. # 得到这一页图片
  67. image_pil = image[i]
  68. # 把这一页的图片格式转成numpy类型
  69. image_numpy = np.array(image_pil)
  70. # 得到这一页图片德国高度,为了之后得到实际的box
  71. page_boxs_height = page_boxs[i][0][3]
  72. print(page_boxs[i][1])
  73. # 遍历这一页的框
  74. for key, values in page_boxs[i][1].items():
  75. # 把实际的图片大小resize到页面的大小
  76. image_numpy = cv2.resize(image_numpy, page_boxs[i][0][2:4], interpolation=cv2.INTER_AREA)
  77. for value in values:
  78. # The y-coordinates are given as the distance from the bottom of the page.
  79. real_box = (value[0], page_boxs_height-value[3], value[2], page_boxs_height-value[1])
  80. real_box_integer = tuple([round(jj) for jj in real_box])
  81. # 画图
  82. cv2.rectangle(image_numpy, real_box_integer[:2], real_box_integer[2:], draw_color[key], 2)
  83. plt.figure(), plt.imshow(image_numpy)
  84. plt.show()