'''Tender document content extraction.'''
import pandas as pd
import numpy as np
import pdfplumber
import json
import os
import re
import cv2
from io import BytesIO
from typing import Optional, List
from pdfminer.layout import LTRect, LTTextBoxHorizontal, LTLine, LTFigure, LTCurve, LTImage, LTChar
from pdfminer.high_level import extract_pages
from pdfminer.pdfcolor import LITERAL_DEVICE_CMYK
from pdfminer.pdftypes import (
    LITERALS_DCT_DECODE,
    LITERALS_JBIG2_DECODE,
    LITERALS_JPX_DECODE,
    LITERALS_FLATE_DECODE,
)
from pprint import pprint
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
import camelot
from .tools import RefPageNumberResolver

# Known table-header vocabulary; the empty string lets blank cells in a
# header row count toward a header match.
HEADERS = {'序号', '项目编码', '项目名称', '项目特征', '单位', '工程量', '全费用综合单价', '合价', '备注', '主材名称', '规格型号', '不低于下列同档次品牌', '投标选用品牌及规格型号', '名称', '事项', '数量', '含税单价(元)', '含税合价(元)', '条款号', '评分因素', '评分标准', '页码'}
HEADERS |= {'条款号', '评审因素', '评审标准', ''}
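
# The header vocabulary above drives the table-continuation logic below: a row
# whose cells overlap HEADERS in more than two positions is treated as a header
# row. A minimal sketch of that check (values here are hypothetical):
#
#   row = ['序号', '项目名称', '单位', '工程量']
#   cleaned = {''.join(cell.split()) for cell in row if cell}
#   len(HEADERS & cleaned) > 2   # -> True, so the row starts a new table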

def is_title(line: str) -> bool:
    title_word = re.findall(r'^[(\(][一二三四五六七八九十]+[\))]|^\d\.|^1\d\.|^2\d\.|^[第][一二三四五六七八九十\d]+[章节条]|[一二三四五六七八九十]+[、要是]', line.strip())
    if title_word:
        return True
    title_word = re.findall(r'^附录|^参考文献|^附表', line.strip())
    if title_word:
        return True
    return False
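
# A few inputs and the values is_title should yield under the regexes above
# (illustrative spot checks, not an exhaustive test):
#
#   is_title('第一章 招标公告')    # True: ^[第]...[章节条]
#   is_title('(三)投标保证金')    # True: full-width parenthesised numeral
#   is_title('12.开标时间')        # True: ^1\d\.
#   is_title('正文普通段落')       # False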

PIL_ERROR_MESSAGE = "PIL import error"


def _save_jpeg(image: LTImage, path: str) -> str:
    """Save a JPEG encoded image."""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".jpg"
    with open(path, "wb") as fp:
        if LITERAL_DEVICE_CMYK in image.colorspace:
            try:
                from PIL import Image, ImageChops  # type: ignore[import]
            except ImportError:
                raise ImportError(PIL_ERROR_MESSAGE)
            # CMYK JPEGs embedded in PDFs are often stored inverted;
            # invert back and convert to RGB before saving.
            ifp = BytesIO(raw_data)
            i = Image.open(ifp)
            i = ImageChops.invert(i)
            i = i.convert("RGB")
            i.save(fp, "JPEG")
        else:
            fp.write(raw_data)
    return path

def _save_jpeg2000(image: LTImage, path: str) -> str:
    """Save a JPEG 2000 encoded image."""
    raw_data = image.stream.get_rawdata()
    assert raw_data is not None
    path = path + ".png"
    try:
        from PIL import Image  # type: ignore[import]
    except ImportError:
        raise ImportError(PIL_ERROR_MESSAGE)
    # Writing the raw data alone produces a file most image viewers cannot
    # open; decoding with PIL and re-saving through OpenCV yields a PNG that
    # other programs open without trouble.
    ifp = BytesIO(raw_data)
    i = Image.open(ifp)
    opencv_image = cv2.cvtColor(np.array(i), cv2.COLOR_RGB2BGR)
    cv2.imwrite(path, opencv_image)
    return path

def _detect_extension(data: bytes) -> str:
    """Guess an image file extension from the payload's magic bytes."""
    if data[:2] == b'\xff\xd8' and data[-2:] == b'\xff\xd9':
        return '.jpg'   # JPEG: SOI ... EOI markers
    if data[:8] == b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a':
        return '.png'   # PNG signature
    if data[:2] == b'\x42\x4d':
        return '.bmp'   # 'BM'
    if data[:6] in (b'\x47\x49\x46\x38\x37\x61', b'\x47\x49\x46\x38\x39\x61'):
        return '.gif'   # 'GIF87a' / 'GIF89a'
    if data[:2] in (b'\x4d\x4d', b'\x49\x49'):
        return '.tiff'  # big- / little-endian TIFF
    return '.unk'


def export_image(image: LTImage, path: str) -> Optional[str]:
    """Save an LTImage to disk and return the path written, or None."""
    filters = image.stream.get_filters()
    if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
        return _save_jpeg(image, path)
    if len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
        return _save_jpeg2000(image, path)
    # Fall back to sniffing the decoded data, then the raw stream data.
    payload = image.stream.get_data() or image.stream.get_rawdata()
    if not payload:
        return None
    path += _detect_extension(payload)
    with open(path, 'wb') as file:
        file.write(payload)
    return path
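
# Usage sketch for export_image (hypothetical path; `item` would be an LTImage
# taken from pdfminer's layout tree, e.g. inside an LTFigure):
#
#   saved = export_image(item, 'extracted_images/image_page_0_0')
#   # -> e.g. 'extracted_images/image_page_0_0.jpg'; the extension comes from
#   #    the stream filters or the payload's magic bytes, and None is returned
#   #    when the stream carries no data at all.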

def main_parse(pdf_path: str, title_path: str, image_dir: str) -> tuple:
    """Extract candidate titles from every page and dump them to JSON."""
    texts = []
    # Read the PDF and walk the layout of every page.
    for page_number, page_layout in enumerate(extract_pages(pdf_path)):
        title_index = 0
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                text = element.get_text().strip()
                # Assume a title is a single line set in a larger font.
                if text and (is_title(text) or element.height > 15):
                    texts.append({'index': title_index, 'page_number': page_number, 'bbox': element.bbox, 'text': text})
                    title_index += 1
            # Image extraction lives in parse_image() below.
    with open(title_path, 'w', encoding='utf-8') as fp:
        json.dump(texts, fp, indent=4, ensure_ascii=False)
    return title_path, image_dir
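
# Usage sketch for main_parse (hypothetical paths):
#
#   os.makedirs('extracted_images', exist_ok=True)
#   main_parse(pdf_path='tender.pdf',
#              title_path='tender_titles.json',
#              image_dir='extracted_images')
#   # tender_titles.json then holds one record per detected title, e.g.
#   # {"index": 0, "page_number": 3, "bbox": [...], "text": "第一章 招标公告"}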

def parse_title(file_path: str, title_path: Optional[str] = None) -> Optional[str]:
    """Title parsing, used to keep quotations unique.

    Args:
        file_path: path of the PDF to parse.
        title_path: output path for the title JSON.
    Returns:
        title_path once the titles have been written, or None if no path was given.
    """
    results = []
    seq_num = 0
    for page_number, page_layout in enumerate(extract_pages(file_path)):
        title_index = 0
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal) and len(element._objs) == 1:
                text = element.get_text().strip()
                if text and (is_title(text) or element.height > 15):
                    results.append({
                        'index': title_index,
                        'page_number': page_number,
                        'bbox': element.bbox,
                        'text': text,
                        'title': text,
                        'seq_num': seq_num
                    })
                    seq_num += 1
                    title_index += 1
    if title_path:
        with open(title_path, 'w', encoding='utf-8') as fp:
            json.dump(results, fp, indent=4, ensure_ascii=False)
    return title_path

def parse_image(file_path: str, image_dir: str, image_meta_path: str) -> str:
    """Extract the images embedded in a PDF.

    Args:
        file_path: path of the PDF to parse.
        image_dir: directory the extracted images are written to.
        image_meta_path: output path for the image metadata JSON.
    Returns:
        image_meta_path once the metadata has been written.
    """
    image_list = []
    for page_number, page_layout in enumerate(extract_pages(file_path)):
        image_index = 0
        for element in page_layout:
            if isinstance(element, LTFigure):
                for e_obj in element._objs:
                    if isinstance(e_obj, LTImage):
                        # Export the embedded image stream to disk.
                        image_file = os.path.join(image_dir, f'image_page_{page_number}_{image_index}')
                        image_file = export_image(e_obj, image_file)
                        image_list.append({
                            "image_index": image_index,
                            "page_number": page_number,
                            "image_name": image_file
                        })
                        image_index += 1
    if image_meta_path:
        with open(image_meta_path, 'w', encoding='utf-8') as fp:
            json.dump(image_list, fp, indent=4, ensure_ascii=False)
    return image_meta_path
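
# Usage sketch for parse_image (hypothetical paths):
#
#   os.makedirs('extracted_images', exist_ok=True)
#   parse_image(file_path='tender.pdf',
#               image_dir='extracted_images',
#               image_meta_path='tender_images.json')
#   # tender_images.json lists {"image_index", "page_number", "image_name"}
#   # for every embedded image export_image managed to write.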

def table_parse(pdf_path: str,
                title_path: str,
                start_title: str = '第三章 评标办法(综合评估法)',
                end_title: str = '第四章 合同条款及格式',
                table_path: Optional[str] = None,
                start_page_number: Optional[int] = None,
                end_page_number: Optional[int] = None
                ) -> Optional[str]:
    """Parse the tables found in a page range of the PDF.

    Args:
        pdf_path: path of the PDF to parse.
        title_path: title JSON produced by main_parse/parse_title.
        start_title: title whose page opens the range.
        end_title: title whose page closes the range.
        table_path: output path for the table JSON.
        start_page_number: first page of the range; resolved from start_title when None.
        end_page_number: last page of the range; resolved from end_title when None.
    Returns:
        table_path once the tables have been written.
    """
    tables = []
    if (start_page_number is None) or (end_page_number is None):
        df = pd.read_json(title_path)
        start_page_number = df[df['text'] == start_title].page_number.max()
        end_page_number = df[df['text'] == end_title].page_number.max()

    def concat_table(tables: list, table: list, page_number: int) -> list:
        """Add a table to the results: either start a new table, or append
        the rows to the previous table when it continues across pages.
        """
        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
        if len(HEADERS & set(first)) > 2:
            # Plenty of known header cells: a standalone header row, start a new table.
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1})
        elif tables and ((page_number - 1) in tables[-1]['page_numbers']) and (len(first) == tables[-1]['col_len']):
            # Same column count as the table on the previous page: merge as a continuation.
            tables[-1]['page_numbers'].append(page_number)
            tables[-1]['table'].extend(table)
        else:
            tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0})
        return tables

    with pdfplumber.open(pdf_path) as pdf:
        print(start_page_number, end_page_number)
        for i in range(start_page_number, end_page_number):
            for table in pdf.pages[i].extract_tables():
                tables = concat_table(tables, table, i)
    if table_path:
        with open(table_path, 'w', encoding='utf-8') as fp:
            json.dump(tables, fp, indent=4, ensure_ascii=False)
    return table_path
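
# Usage sketch for table_parse (hypothetical paths). The page range is resolved
# from the title JSON written by main_parse when explicit page numbers are not
# supplied:
#
#   table_parse(pdf_path='tender.pdf',
#               title_path='tender_titles.json',
#               table_path='tender_tables.json')
#   # or, skipping the title lookup entirely:
#   table_parse(pdf_path='tender.pdf', title_path='tender_titles.json',
#               table_path='tender_tables.json',
#               start_page_number=20, end_page_number=35)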

class PdfExtractAttr_(object):
    def __init__(self, file_path: str):
        """PDF file parser.

        Args:
            file_path: path of the PDF to parse.
        """
        super(PdfExtractAttr_, self).__init__()
        self.file_path = file_path
        self.details = []
        self.tables = []
        self.content = []
        self.chapters = []
        self.references = []
        self.detail_df = None
        self.outlines = None

    def parse_outline(self):
        """Parse the PDF outline (bookmarks)."""
        results = []
        with open(self.file_path, "rb") as fp:
            try:
                parser = PDFParser(fp)
                document = PDFDocument(parser)
                ref_pagenum_resolver = RefPageNumberResolver(document)
                outlines = document.get_outlines()
                for (level, title, dest, a, se) in outlines:
                    # An outline entry may point at its page through an explicit
                    # destination, an action, or a structure element.
                    if dest:
                        page_num = ref_pagenum_resolver.resolve(dest)
                    elif a:
                        page_num = ref_pagenum_resolver.resolve(a)
                    elif se:
                        page_num = ref_pagenum_resolver.resolve(se)
                    else:
                        page_num = None
                    results.append({'level': level, 'title': title, 'page_number': page_num})
            except PDFNoOutlines:
                print("No outlines found.")
            except PDFSyntaxError:
                print("Corrupted PDF or non-PDF file.")
            finally:
                parser.close()
        with open('outlines.json', 'w', encoding='utf-8') as op:
            json.dump(results, op, indent=4, ensure_ascii=False)
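
    # Usage sketch (hypothetical path):
    #
    #   agent = PdfExtractAttr_(file_path='tender.pdf')
    #   agent.parse_outline()   # writes outlines.json to the working directory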

    def extract_content(self, content_path: Optional[str] = None) -> Optional[str]:
        """Extract the plain text of every page and dump it to JSON."""
        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                self.content.append({
                    'page_number': page.page_number - 1,
                    'text': page.extract_text()
                })
        if content_path:
            with open(content_path, 'w', encoding='utf-8') as fp:
                json.dump(self.content, fp, indent=4, ensure_ascii=False)
        return content_path

    def parse_text(self) -> None:
        """Parse text boxes and infer each box's alignment."""
        for page_number, page_layout in enumerate(extract_pages(self.file_path)):
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    # Distance to the left page edge
                    left = element.x0
                    # Distance to the right page edge
                    right = (page_layout.width - element.x1)
                    # Distance to the top page edge
                    top = (page_layout.height - element.y1)
                    # Distance to the bottom page edge
                    bottom = element.y0
                    # Width of the text box
                    width = element.width
                    if (left > right) and (abs(left - right) > 100):
                        alignment = 'right'
                    elif (left > 100) and (abs(left - right) < 50) and ((abs(left - right) / width) < 0.5):
                        alignment = 'center'
                    else:
                        alignment = 'left'
                    self.details.append({
                        'page_number': page_number,
                        'index': element.index,
                        'x0': element.bbox[0],
                        'y0': element.bbox[1],
                        'x1': element.bbox[2],
                        'y1': element.bbox[3],
                        'alignment': alignment,
                        'lines': len(element._objs),
                        'text': element.get_text().strip(),
                        'is_table_name': element.get_text().strip().endswith('表')
                    })
        self.detail_df = pd.DataFrame(self.details)
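
    # Worked example of the alignment heuristic above (numbers are made up):
    # on a 595 pt wide page, a box with x0=300, x1=580 gives left=300,
    # right=15, abs(left - right)=285 > 100 -> 'right'; a box with x0=120,
    # x1=480 gives left=120, right=115, abs diff 5 < 50 and 5/360 < 0.5
    # -> 'center'; everything else falls back to 'left'.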

    def concat_table(self, table: list, page_number: int, table_name: str = None, new: bool = False) -> None:
        """Add a table to the results: either start a new table, or append
        the rows to the previous table when it continues across pages.

        Args:
            table: extracted rows of the current table.
            page_number: zero-based index of the page the table was found on.
            table_name: caption detected above the table, if any.
            new: force the table to be recorded as a new, independent table.
        """
        first = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[0]]
        if new:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
            return
        if len(table) > 1:
            second = [''.join([i for i in cell.split() if i]) if cell else cell for cell in table[1]]
        else:
            second = None
        if not self.tables or len(HEADERS & set(first)) > 2:
            # Plenty of known header cells in the first row: a standalone header, start a new table.
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 1, "table_name": table_name if table_name else ""})
        elif second and (len(HEADERS & set(second)) > 2):
            # Header cells sit in the second row: the first row is likely the table caption.
            if not table_name:
                first = [i for i in first if i]
                if len(first) == 1:
                    table_name = "".join(first)
            self.tables.append({"page_numbers": [page_number], "title_len": len(second), "col_len": len(table[-1]), "table": table[1:], "confidence": 1, "table_name": table_name if table_name else ""})
        elif ((page_number - 1) in self.tables[-1]['page_numbers']) and (len(first) == self.tables[-1]['col_len']):
            # Same column count as the table on the previous page: merge as a continuation.
            self.tables[-1]['page_numbers'].append(page_number)
            self.tables[-1]['table'].extend(table)
        else:
            self.tables.append({"page_numbers": [page_number], "title_len": len(first), "col_len": len(table[-1]), "table": table, "confidence": 0, "table_name": table_name if table_name else ""})
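
    # Decision order in concat_table, summarised: (1) new=True or a first-row
    # header match starts a fresh table; (2) a second-row header match starts a
    # fresh table and promotes a lone non-empty first-row cell to table_name;
    # (3) a matching column count against a table seen on the previous page is
    # treated as a continuation and the rows are appended; (4) anything else is
    # recorded as a new low-confidence table (confidence 0).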

    def parse_table(self) -> None:
        """Parse tables with pdfplumber alone."""
        with pdfplumber.open(self.file_path) as pdf:
            for page_number, page_layout in enumerate(pdf.pages):
                # Check whether the page contains any tables.
                tables = page_layout.find_tables()
                # Exactly one table on the page: run the merge check against earlier pages.
                if len(tables) == 1:
                    table = tables[0]
                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
                    if table_title_df.empty:
                        self.concat_table(table.extract(), page_number=page_number)
                    else:
                        table_title_name = table_title_df.iloc[0]['text']
                        self.concat_table(table.extract(), page_number=page_number, table_name=table_title_name)
                # Several tables on the page: only the first can continue an
                # earlier table, the rest are independent. Not handled here yet.
                elif len(tables) > 1:
                    pass

    def parse_table_pro(self, table_path: str = 'all_tables.json') -> str:
        """Parse tables, using pdfplumber for detection and camelot for extraction."""
        if self.detail_df is None:
            self.parse_text()
        with pdfplumber.open(self.file_path) as pdf:
            for page_number, page_layout in enumerate(pdf.pages):
                # Check whether the page contains any tables.
                tables = page_layout.find_tables()
                if not tables:
                    continue
                # camelot counts pages from 1, pdfplumber's enumeration from 0.
                tables_pro = camelot.read_pdf(
                    self.file_path,
                    # flavor='stream',
                    pages=str(page_number + 1),
                    # edge_tol=200,
                )
                if not tables_pro:
                    continue
                print(len(tables), len(tables_pro))
                # Exactly one table on the page: run the merge check against earlier pages.
                if len(tables_pro) == 1:
                    print(f"Parsing the table on PDF page {page_number}")
                    table_pro = tables_pro[0].df.to_dict(orient='split')['data']
                    table_title_df = self.detail_df.query(f''' page_number == {page_number} and is_table_name == True and alignment == "center" ''')
                    if table_title_df.empty:
                        self.concat_table(table_pro, page_number=page_number)
                    else:
                        table_title_name = table_title_df.iloc[0]['text']
                        self.concat_table(table_pro, page_number=page_number, table_name=table_title_name)
                # Several tables on the page: merge-check the first, record the rest as new tables.
                elif len(tables_pro) > 1:
                    print(f"Parsing the tables on PDF page {page_number}")
                    first_table = tables_pro[0]
                    self.concat_table(first_table.df.to_dict(orient='split')['data'], page_number=page_number)
                    for table_index in range(1, len(tables_pro)):
                        self.concat_table(tables_pro[table_index].df.to_dict(orient='split')['data'], page_number=page_number, new=True)
        with open(table_path, 'w', encoding='utf-8') as fp:
            json.dump(self.tables, fp, indent=4, ensure_ascii=False)
        return table_path
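
    # Usage sketch for parse_table_pro (hypothetical path); parse_text() runs
    # automatically first when detail_df has not been built yet:
    #
    #   agent = PdfExtractAttr_(file_path='tender.pdf')
    #   agent.parse_table_pro(table_path='all_tables.json')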

if __name__ == '__main__':
    # pdf_path = 'data/预审查数据/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.pdf'
    # image_dir = 'data/预审查数据/extracted_images'
    # title_path = 'data/预审查数据/基于物联网技术的三峡坝区智慧仓储研究与建设招标文件-发出.json'
    # pdf_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.pdf'
    # image_dir = 'data/预审查数据/extracted_images'
    # title_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
    # os.makedirs(image_dir, exist_ok=True)
    # main_parse(pdf_path=pdf_path, title_path=title_path, image_dir=image_dir)
    # table_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
    # content_path = '/mnt/d/Work_PWS/财报素材/财报素材/财报素材/600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告.json'
    # agent = PdfExtractAttr_(file_path=pdf_path)
    ## agent.extract_content(content_path=content_path)
    # contents = agent.output_()
    # agent.parse_text()
    # agent.parse_table()
    ## agent.parse_table_pro(table_path=table_path)
    # all_tables = agent.output()
    import glob
    dir_path = 'data/财报素材'
    for pdf_path in glob.glob(f'{dir_path}/*.pdf'):
        print(pdf_path)
        if '600000_20241031_上海浦东发展银行股份有限公司2024年第三季度报告' not in pdf_path:
            continue
        agent = PdfExtractAttr_(file_path=pdf_path)
        content_path = f'{dir_path}/{pdf_path.split("/")[-1].split(".")[0]}_content.json'
        agent.extract_content(content_path=content_path)
        table_path = f'{dir_path}/{pdf_path.split("/")[-1].split(".")[0]}_table.json'
        agent.parse_table_pro(table_path=table_path)