# -*- coding: utf-8 -*-
# @Author: privacy
# @Date:   2024-06-11 13:43:14
# @Last Modified by:   privacy
# @Last Modified time: 2024-06-11 14:10:56
#
# Extract section titles, embedded images and tables from a (Chinese)
# bid-document PDF.  Two earlier commented-out prototypes (PyPDF2-based and
# fitz/PyMuPDF-based) were removed; see version control history if needed.

import os
import re
import json
from io import BytesIO
from pprint import pprint

import numpy as np
import cv2
import pandas as pd
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import (
    LTRect,
    LTTextBoxHorizontal,
    LTLine,
    LTFigure,
    LTCurve,
    LTImage,
    LTChar,
)
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JBIG2_DECODE,
    LITERALS_JPX_DECODE,
    LITERALS_FLATE_DECODE,
)

# Message raised when Pillow is required but not installed.  The original
# code referenced this name without ever defining it (NameError at runtime).
PIL_ERROR_MESSAGE = (
    "Pillow is required to decode this image format: pip install Pillow"
)


def is_title(line: str) -> bool:
    """Return True if *line* looks like a (Chinese) section heading.

    Matches numbered headings such as "(一)", "1.", "第三章", "二、", as
    well as the fixed headings 附录 / 参考文献 / 附表.
    """
    stripped = line.strip()
    if re.findall(
        r'^[(\(][一二三四五六七八九十]+[\))]'
        r'|^\d\.|^1\d\.|^2\d\.'
        r'|^[第][一二三四五六七八九十\d]+[章节条]'
        r'|[一二三四五六七八九十]+[、要是]',
        stripped,
    ):
        return True
    if re.findall(r'^附录|^参考文献|^附表', stripped):
        return True
    return False


def export_image(image: LTImage, path: str) -> str:
    """Save an LTImage to disk and return the path actually written.

    *path* is the destination WITHOUT extension; the proper extension is
    appended depending on how the embedded stream is encoded.
    """
    (width, height) = image.srcsize
    filters = image.stream.get_filters()
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        name = _save_jpeg(image, path)
    elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        name = _save_jpeg2000(image, path)
    elif image.bits == 1:
        # 1-bit bilevel bitmap: each row is padded to a whole byte.
        name = _save_bmp(image, path, width, height, (width + 7) // 8, image.bits)
    elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
        name = _save_bmp(image, path, width, height, width * 3, image.bits * 3)
    elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
        name = _save_bmp(image, path, width, height, width, image.bits)
    elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
        name = _save_bytes(image, path)
    else:
        name = _save_raw(image, path)
    return name


def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG (DCTDecode) encoded image; CMYK data is inverted via PIL."""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".jpg"
    with open(path, "wb") as fp:
        if LITERAL_DEVICE_CMYK in image.colorspace:
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)
            ifp = BytesIO(raw_data)
            i = Image.open(ifp)
            i = ImageChops.invert(i)
            i = i.convert("RGB")
            i.save(fp, "JPEG")
        else:
            # Plain RGB/gray JPEG streams are already valid JPEG files.
            fp.write(raw_data)
    return path


def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 (JPXDecode) encoded image, re-encoded as PNG."""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".png"
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    # If we just write the raw data, most image programs cannot open the
    # file.  Opening with PIL and re-saving produces a file that is easily
    # opened by other programs.
    ifp = BytesIO(raw_data)
    i = Image.open(ifp)
    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
    cv2.imwrite(path, opencv_image)
    return path


def _save_bmp(image: LTImage, path: str, width: int, height: int,
              bytes_per_line: int, bits: int) -> str:
    """Save an uncompressed bitmap stream as BMP via Pillow.

    The original file called this helper without defining it (and without
    passing any output path), which raised a NameError at runtime.
    """
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    data = image.stream.get_data()
    # Map bit depth to a Pillow raw mode.
    if bits == 1:
        mode = "1"
    elif bits == 24:
        mode = "RGB"
    else:
        mode = "L"
    path = path + ".bmp"
    img = Image.frombytes(mode, (width, height), data, "raw", mode, bytes_per_line)
    img.save(path)
    return path


def _save_bytes(image: LTImage, path: str) -> str:
    """Persist a Flate-decoded stream as raw decoded bytes (.img).

    Missing in the original file; defined here so the FlateDecode branch of
    export_image no longer raises NameError.
    """
    path = path + ".img"
    with open(path, "wb") as fp:
        fp.write(image.stream.get_data())
    return path


def _save_raw(image: LTImage, path: str) -> str:
    """Fallback: persist the undecoded stream bytes (.raw).

    Missing in the original file; defined here so the fallback branch of
    export_image no longer raises NameError.
    """
    raw_data = image.stream.get_rawdata()
    path = path + ".raw"
    with open(path, "wb") as fp:
        fp.write(raw_data or b"")
    return path


def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
    """Walk every page of *pdf_path*, collecting candidate titles and images.

    Titles (single-line text boxes that match is_title or are taller than
    15 pt) are dumped to *title_path* as JSON; images are exported into
    *image_dir*.
    """
    texts = []
    images = []
    # Read the PDF and iterate over its page layouts.
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        title_index = 0
        image_index = 0
        for element in page_layout:
            if isinstance(element, LTLine):
                pass
            elif isinstance(element, LTRect):
                pass
            elif isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                text = element.get_text().strip()
                # Assume a title is a single line with a larger font height.
                if text and (is_title(text) or element.height > 15):
                    texts.append({
                        'index': title_index,
                        'pageno': page_number,
                        'bbox': element.bbox,
                        'text': text,
                    })
                    title_index += 1
            elif isinstance(element, LTFigure):
                for e_obj in element._objs:
                    if isinstance(e_obj, LTImage):
                        # Export the embedded image data.
                        image_file = os.path.join(
                            image_dir, f'image_page_{page_number}_{image_index}')
                        image_file = export_image(e_obj, image_file)
                        images.append(image_file)
                        pprint(f'Image saved: {image_file}')
                        image_index += 1
    with open(title_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)


def table_parse(pdf_path: str, title_path: str,
                start_title: str = '六、已标价工程量清单',
                end_title: str = '七、施工组织设计',
                table_path: str = 'table.json') -> list:
    """Extract tables from the pages between *start_title* and *end_title*.

    Pages whose first row contains enough known header words start a new
    table (confidence 1); a page whose column count matches the previous
    table and directly follows it is treated as a continuation and merged;
    anything else starts a low-confidence table (confidence 0).  The result
    is written to *table_path* and returned.
    """
    # Known header cell texts used to recognize a fresh table header row.
    header_words = {
        '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量',
        '全费用综合单价', '合价', '备注', '主材名称', '规格型号',
        '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项',
        '数量', '含税单价(元)', '含税合价(元)',
    }
    tables = []
    df = pd.read_json(title_path)
    start_page_number = df[df['text'] == start_title].pageno.max()
    end_page_number = df[df['text'] == end_title].pageno.max()
    # max() on an empty selection yields NaN, which would make range() fail
    # with a cryptic TypeError — fail early with a clear message instead.
    if pd.isna(start_page_number) or pd.isna(end_page_number):
        raise ValueError(
            f'start/end title not found in {title_path}: '
            f'{start_title!r} / {end_title!r}')
    with pdfplumber.open(pdf_path) as pdf:  # original leaked the handle
        for i in range(int(start_page_number), int(end_page_number)):
            table = pdf.pages[i].extract_table()
            if not table:
                continue
            # Normalize header cells: strip all internal whitespace.
            first = [''.join(cell.split()) if cell else cell
                     for cell in table[0]]
            if len(header_words & set(first)) > 2:
                # Many header words found: independent header, new table.
                tables.append({
                    "pagenos": [i],
                    "title_len": len(first),
                    "col_len": len(table[-1]),
                    "table": table,
                    "confidence": 1,
                })
            elif (tables  # original indexed tables[-1] on an empty list
                    and (i - 1) in tables[-1]['pagenos']
                    and len(first) == tables[-1]['col_len']):
                # Same column count on the following page: continuation row
                # block of the previous table — merge.
                tables[-1]['pagenos'].append(i)
                tables[-1]['table'].extend(table)
            else:
                tables.append({
                    "pagenos": [i],
                    "title_len": len(first),
                    "col_len": len(table[-1]),
                    "table": table,
                    "confidence": 0,
                })
    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)
    return tables


if __name__ == '__main__':
    pdf_path = './投标文件-修改版9-5-1-1.pdf'
    title_path = './投标文件-修改版9-5-1-1.json'
    image_dir = './extracted_images'
    os.makedirs(image_dir, exist_ok=True)
    main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
    tables = table_parse(pdf_path=pdf_path, title_path=title_path,
                         start_title='六、已标价工程量清单',
                         end_title='七、施工组织设计')