123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251 |
- # -*- coding: utf-8 -*-
- # @Author: privacy
- # @Date: 2024-06-11 13:43:14
- # @Last Modified by: privacy
- # @Last Modified time: 2024-06-11 14:10:56
- # import os
- # from PIL import Image
- # from PyPDF2 import PdfReader
- # # 读取PDF文件
- # with open(pdf_path, 'rb') as file:
- # reader = PdfReader(file)
- # num_pages = len(reader.pages)
- # # 遍历PDF的每一页
- # for page_num in range(num_pages):
- # page = reader.pages[page_num]
- # # 提取页面中的图像
- # if '/XObject' in page['/Resources']:
- # xobjects = page['/Resources']['/XObject'].get_object()
- # for obj in xobjects:
- # if xobjects[obj]['/Subtype'] == '/Image':
- # size = (xobjects[obj]['/Width'], xobjects[obj]['/Height'])
- # data = xobjects[obj].get_data()
- # if xobjects[obj]['/ColorSpace'] == '/DeviceRGB':
- # mode = "RGB"
- # else:
- # mode = "P"
- # img = Image.frombytes(mode, size, data)
- # img_path = os.path.join(output_dir, f'image_{page_num}_{obj}.png')
- # img.save(img_path)
- # print(f'Image saved: {img_path}')
- #######################################################################
- # import os
- # import re
- # import fitz
- # def pdf2pic(path, save_path):
- # checkXO = r"/Type(?= */XObject)"
- # checkIM = r"/Subtype(?= */Image)"
- # pdf = fitz.open(path)
- # lenXREF = pdf._getXrefLength()
- # imgcount = 0
- # for i in range(1, lenXREF):
- # text = pdf._getXrefString(i)
- # isXObject = re.search(checkXO, text)
- # isImage = re.search(checkIM, text)
- # if not isXObject or not isImage:
- # continue
- # imgcount += 1
- # pix = fitz.Pixmap(pdf, i)
- # new_name = f"img_{imgcount}.png"
- # if pix.n < 5:
- # pix.writePNG(os.path.join(pic_path, new_name))
- # else:
- # pix0 = fitz.Pixmap(fitz.csRGB, pix)
- # pix0.writePNG(os.path.join(pic_path, new_name))
- # pix0 = None
- # pix = None
- # if __name__ == '__main__':
- # pdf2pic(pdf_path, image_dir)
- #######################################################################
- import os
- import re
- import json
- from io import BytesIO
- from pprint import pprint
- import numpy as np
- import cv2
- from pdfminer.high_level import extract_pages
- from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
- from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
- from pdfminer.pdfcolor import LITERAL_DEVICE_GRAY
- from pdfminer.pdfcolor import LITERAL_DEVICE_RGB
- from pdfminer.pdftypes import (
- LITERALS_DCT_DECODE,
- LITERALS_JBIG2_DECODE,
- LITERALS_JPX_DECODE,
- LITERALS_FLATE_DECODE,
- )
- import pandas as pd
- import pdfplumber
# Numbering-style heading markers. Every alternative except the last one is
# anchored at the start of the line; the last one ("<Chinese numeral>+" followed
# by one of "、", "要", "是") may match anywhere in the line.
# NOTE(review): the bracket character classes list the ASCII parenthesis twice;
# presumably one of each pair was meant to be a full-width parenthesis — confirm.
_TITLE_NUMBERING_RE = re.compile(
    r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.'
    r'|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]'
)

# Fixed heading keywords that must start the line.
_TITLE_KEYWORD_RE = re.compile(r'^附录|^参考文献|^附表')


def is_title(line: str) -> bool:
    """Return True when *line* looks like a section heading.

    The line is stripped of surrounding whitespace and then checked against
    two patterns: numbering-style markers (e.g. "(一)", "1.", "12.", "第3章",
    "一、") and fixed keywords ("附录", "参考文献", "附表").

    Args:
        line: A single line of text extracted from the document.

    Returns:
        True if either pattern matches, False otherwise (including for an
        empty or whitespace-only line).
    """
    stripped = line.strip()
    return bool(
        _TITLE_NUMBERING_RE.search(stripped)
        or _TITLE_KEYWORD_RE.search(stripped)
    )
def _save_raw_stream(image: LTImage, path: str, width: int, height: int) -> str:
    """Dump the decoded stream bytes unchanged and return the written path."""
    # The bit depth and dimensions go into the file name because the dump
    # itself carries no header describing them.
    out_path = f"{path}.{image.bits}.{width}x{height}.img"
    with open(out_path, "wb") as fp:
        fp.write(image.stream.get_data())
    return out_path


def _save_pixels_png(image: LTImage, path: str, width: int, height: int, channels: int) -> str:
    """Write uncompressed 1-bit or 8-bit samples as a PNG and return its path.

    Falls back to a raw dump when the decoded stream is shorter than the
    declared geometry requires, so a malformed image never aborts the run.
    """
    data = image.stream.get_data()
    if image.bits == 1:
        # One-bit rows are packed eight pixels per byte and padded to a byte.
        stride = (width + 7) // 8
        needed = stride * height
    else:
        stride = width * channels
        needed = stride * height
    if width <= 0 or height <= 0 or len(data) < needed:
        return _save_raw_stream(image, path, width, height)

    samples = np.frombuffer(data, dtype=np.uint8, count=needed)
    if image.bits == 1:
        unpacked = np.unpackbits(samples.reshape(height, stride), axis=1)
        # assumes a set bit means white (default decode) — TODO confirm
        pixels = unpacked[:, :width] * 255
    elif channels == 3:
        # OpenCV expects BGR channel order when writing.
        pixels = cv2.cvtColor(samples.reshape(height, width, 3), cv2.COLOR_RGB2BGR)
    else:
        pixels = samples.reshape(height, width)

    out_path = path + ".png"
    if not cv2.imwrite(out_path, pixels):
        raise OSError(f"could not write image file: {out_path}")
    return out_path


def export_image(image: LTImage, path: str) -> str:
    """Save an LTImage to disk and return the path of the written file.

    Args:
        image: The pdfminer image object to export.
        path: Destination path without extension; the extension is chosen
            from the image encoding.

    Returns:
        The full path of the file that was written.

    Dispatch order:
        1. A single DCT filter  -> JPEG via ``_save_jpeg``.
        2. A single JPX filter  -> PNG via ``_save_jpeg2000``.
        3. 1-bit samples        -> PNG.
        4. 8-bit DeviceRGB      -> PNG.
        5. 8-bit DeviceGray     -> PNG.
        6. Anything else        -> raw dump of the decoded stream (".img").
    """
    width, height = image.srcsize
    filters = image.stream.get_filters()
    # Only a lone filter identifies a directly reusable encoding.
    sole_filter = filters[0][0] if len(filters) == 1 else None

    if sole_filter is not None and sole_filter in LITERALS_DCT_DECODE:
        return _save_jpeg(image, path)
    if sole_filter is not None and sole_filter in LITERALS_JPX_DECODE:
        return _save_jpeg2000(image, path)
    if image.bits == 1:
        return _save_pixels_png(image, path, width, height, channels=1)
    if image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
        return _save_pixels_png(image, path, width, height, channels=3)
    if image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
        return _save_pixels_png(image, path, width, height, channels=1)
    # Other colour spaces / bit depths: keep the bytes rather than guess a layout.
    return _save_raw_stream(image, path, width, height)
def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG (DCT) encoded image and return the written path.

    Args:
        image: The pdfminer image whose raw stream holds JPEG data.
        path: Destination path without extension; ".jpg" is appended.

    Returns:
        ``path + ".jpg"``.

    Raises:
        ValueError: If the image stream has no raw data.
        ImportError: If the image is CMYK and Pillow is not installed.
    """
    raw_data = image.stream.get_rawdata()
    if raw_data is None:
        raise ValueError("image stream has no raw data to save")
    path = path + ".jpg"

    if LITERAL_DEVICE_CMYK not in image.colorspace:
        # The raw stream already is a complete JPEG file.
        with open(path, "wb") as fp:
            fp.write(raw_data)
        return path

    try:
        from PIL import Image, ImageChops  # type: ignore[import]
    except ImportError as err:
        raise ImportError(
            "Pillow is required to convert CMYK JPEG images; "
            "install it with `pip install Pillow`"
        ) from err

    # CMYK data is inverted and converted to RGB before it is re-encoded.
    # The conversion runs before the output file is opened so that a failure
    # does not leave an empty file behind.
    inverted = ImageChops.invert(Image.open(BytesIO(raw_data)))
    rgb_image = inverted.convert("RGB")
    with open(path, "wb") as fp:
        rgb_image.save(fp, "JPEG")
    return path
def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 encoded image as PNG and return the written path.

    Args:
        image: The pdfminer image whose raw stream holds JPEG 2000 data.
        path: Destination path without extension; ".png" is appended.

    Returns:
        ``path + ".png"``.

    Raises:
        ValueError: If the image stream has no raw data.
        ImportError: If Pillow is not installed.
        OSError: If OpenCV reports that the file could not be written.
    """
    raw_data = image.stream.get_rawdata()
    if raw_data is None:
        raise ValueError("image stream has no raw data to save")
    path = path + ".png"
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError as err:
        raise ImportError(
            "Pillow is required to decode JPEG 2000 images; "
            "install it with `pip install Pillow`"
        ) from err

    # Writing the raw JPEG 2000 bytes directly produces files that many image
    # programs cannot open, so the data is decoded and re-encoded instead.
    with Image.open(BytesIO(raw_data)) as decoded:
        # Normalise to three channels: the RGB->BGR swap below rejects
        # grayscale or RGBA arrays.
        rgb_pixels = np.array(decoded.convert("RGB"))
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    if not cv2.imwrite(path, bgr_pixels):
        raise OSError(f"could not write image file: {path}")
    return path
def main_parse(pdf_path: str, title_path: str, image_dir: str) -> None:
    """Extract heading candidates and embedded images from a PDF.

    Heading candidates are written to ``title_path`` as a JSON list of
    ``{'index', 'pageno', 'bbox', 'text'}`` records; images found inside
    figures are exported into ``image_dir``.

    Args:
        pdf_path: Path of the PDF file to analyse.
        title_path: Path of the JSON file that receives the heading records.
        image_dir: Directory that receives the exported images; it is created
            when missing.
    """
    # Exporting an image fails if the target directory does not exist.
    os.makedirs(image_dir, exist_ok=True)

    texts = []
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        # Both counters restart on every page.
        title_index = 0
        image_index = 0
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                # Only single-line text boxes are considered as headings.
                if len(element) != 1:
                    continue
                text = element.get_text().strip()
                # Assume a heading is usually one line set in a larger font:
                # accept it when it matches a heading pattern or the box is
                # taller than 15 units.
                if text and (is_title(text) or element.height > 15):
                    texts.append({'index': title_index, 'pageno': page_number, 'bbox': element.bbox, 'text': text})
                    title_index += 1
            elif isinstance(element, LTFigure):
                for figure_item in element:
                    if not isinstance(figure_item, LTImage):
                        continue
                    # Export the image; the extension is added by export_image.
                    image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                    image_file = export_image(figure_item, image_file)
                    pprint(f'Image saved: {image_file}')
                    image_index += 1

    with open(title_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)
# Header cell texts; a first row sharing more than two of them with this set
# is treated as the header row of a new table.
_TABLE_HEADER_WORDS = frozenset({
    '序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价',
    '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌',
    '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)',
})


def _last_page_of_title(titles: pd.DataFrame, title: str) -> int:
    """Return the highest page number on which *title* appears in *titles*."""
    pages = titles.loc[titles['text'] == title, 'pageno']
    if pages.empty:
        # Without this check the NaN from ``max()`` would surface later as an
        # opaque TypeError in ``range``.
        raise ValueError(f'title not found in title index: {title!r}')
    return int(pages.max())


def table_parse(pdf_path: str, title_path: str, start_title: str = '六、已标价工程量清单', end_title: str = '七、施工组织设计', table_path: str = 'table.json') -> list:
    """Extract the tables located between two headings of a PDF.

    Pages are taken from the page of ``start_title`` (inclusive) up to the
    page of ``end_title`` (exclusive). A table whose first row contains more
    than two known header words starts a new entry with confidence 1. A table
    on the page right after the previous entry, with as many first-row cells
    as that entry's ``col_len``, is appended to it. Any other table becomes a
    new entry with confidence 0.

    Args:
        pdf_path: Path of the PDF file.
        title_path: JSON heading index with ``text`` and ``pageno`` fields.
        start_title: Heading text marking the first page to scan.
        end_title: Heading text marking the page at which scanning stops.
        table_path: Path of the JSON file that receives the result.

    Returns:
        A list of dicts with keys ``pagenos``, ``title_len``, ``col_len``,
        ``table`` and ``confidence``; the same list is written to
        ``table_path``.

    Raises:
        ValueError: If ``start_title`` or ``end_title`` is not in the index.
    """
    titles = pd.read_json(title_path)
    start_page_number = _last_page_of_title(titles, start_title)
    end_page_number = _last_page_of_title(titles, end_title)

    tables = []
    # The context manager closes the PDF handle even if extraction fails.
    with pdfplumber.open(pdf_path) as pdf:
        for page_no in range(start_page_number, end_page_number):
            table = pdf.pages[page_no].extract_table()
            if not table:
                continue
            # Remove all whitespace inside each cell of the first row; empty
            # or missing cells are kept unchanged.
            first = [''.join(cell.split()) if cell else cell for cell in table[0]]
            if len(_TABLE_HEADER_WORDS & set(first)) > 2:
                # Many header words found: an independent header, new table.
                tables.append({"pagenos": [page_no], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
            elif tables and (page_no - 1) in tables[-1]['pagenos'] and len(first) == tables[-1]['col_len']:
                # Continuation of the table from the previous page: merge.
                tables[-1]['pagenos'].append(page_no)
                tables[-1]['table'].extend(table)
            else:
                tables.append({"pagenos": [page_no], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})

    with open(table_path, 'w', encoding='utf-8') as fp:
        json.dump(tables, fp, indent=4, ensure_ascii=False)
    return tables
if __name__ == '__main__':
    # Script entry point: build the heading index and export the images
    # first, then extract the tables between the two configured headings.
    pdf_path = './投标文件-修改版9-5-1-1.pdf'
    title_path = './投标文件-修改版9-5-1-1.json'
    image_dir = './extracted_images'

    # The image output directory must exist before images are written.
    os.makedirs(image_dir, exist_ok=True)

    main_parse(
        pdf_path=pdf_path,
        title_path=title_path,
        image_dir=image_dir,
    )
    tables = table_parse(
        pdf_path=pdf_path,
        title_path=title_path,
        start_title='六、已标价工程量清单',
        end_title='七、施工组织设计',
    )
|