# pdf_miner.py
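# Framework that parses all PDF files and extracts information for testing.
# PdfExtractAttr is the base class for extracting PDF information;
# subclasses implement the matching functionality on top of it.
# Standard-library imports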
  5. import os
  6. import re
  7. import json
  8. import re
  9. import shutil
  10. import pandas as pd
  11. import pdb
  12. import base64
  13. from io import BytesIO
  14. from pprint import pprint
  15. # 第三方包导入
  16. import numpy as np
  17. import pandas as pd
  18. import cv2
  19. import torch
  20. import glob
  21. import logging
  22. import requests
  23. import time
  24. import datetime
  25. from tqdm import tqdm
  26. from tools import RefPageNumberResolver
  27. from get_info import PdfExtractAttr
  28. from get_info import is_title, export_image, _save_jpeg, _save_jpeg2000, _save_bmp, main_parse, table_parse, load_json
  29. from PIL import Image
  30. from pdfminer.image import ImageWriter
  31. # tools function
  32. def create_logger(log_path):
  33. """
  34. 将日志输出到日志文件和控制台
  35. """
  36. logger = logging.getLogger()
  37. logger.setLevel(logging.INFO)
  38. formatter = logging.Formatter(
  39. '%(asctime)s - %(levelname)s - %(message)s')
  40. # 创建一个handler,用于写入日志文件
  41. file_handler = logging.FileHandler(
  42. filename=log_path, mode='w')
  43. file_handler.setFormatter(formatter)
  44. file_handler.setLevel(logging.INFO)
  45. logger.addHandler(file_handler)
  46. # 创建一个handler,用于将日志输出到控制台
  47. console = logging.StreamHandler()
  48. console.setLevel(logging.DEBUG)
  49. console.setFormatter(formatter)
  50. logger.addHandler(console)
  51. return logger
  52. # ocr外部接口
  53. class OcrAgent():
  54. def __init__(self, url):
  55. self.url = url
  56. self.datetime_re = r'\d{4}年\d{1,2}月\d{1,2}日至(?:\d{4}年\d{1,2}月\d{1,2}日|长期)'
  57. # 不同类型证书资质正则
  58. self.re_dict = {
  59. "business_license" : r'营业执照',
  60. "deposit": r'^(?:开户许可证|[\u4e00-\u9fff]+存款账户[\u4e00-\u9fff]+)$',
  61. "production_license": r'\b[\u4e00-\u9fff]*许可证\b',
  62. "qualtifications" : r'\b[\u4e00-\u9fff]*证书',
  63. "proof": r'\b[\u4e00-\u9fff]*证明',
  64. }
  65. # 字迹阈值
  66. self.sign_threshold = 0.05
  67. # 获取图像的ocr信息
  68. def get_content(self, image_path):
  69. try:
  70. with open(image_path, 'rb') as image_file:
  71. files = {"file": ("image.jpg", image_file, "image/jpeg")}
  72. response = requests.post(self.url, files=files)
  73. return response.json()
  74. except:
  75. raise ValueError(f"传入图像{image_path}已损坏")
  76. # 移除图像上的红色印章
  77. def remove_red_seal(self, image_path):
  78. # 读取图像
  79. input_img = cv2.imread(image_path)
  80. # 分离图片的通道
  81. blue_c, green_c, red_c = cv2.split(input_img)
  82. #利用大津法自动选择阈值
  83. thresh, ret = cv2.threshold(red_c, 0, 255, cv2.THRESH_OTSU)
  84. #对阈值进行调整
  85. filter_condition = int(thresh * 1.0)
  86. #移除红色的印章
  87. _, red_thresh = cv2.threshold(red_c, filter_condition, 255, cv2.THRESH_BINARY)
  88. # 把图片转回3通道
  89. result_img = np.expand_dims(red_thresh, axis=2)
  90. result_img = np.concatenate((result_img, result_img, result_img), axis=-1)
  91. return result_img
  92. # 判断图像是否为某公司的营业执照或资质证书信息,并返回提取到的信息
  93. def judge(self, image_path: str, firm_name: str):
  94. # 以下实现要求image_path的路径如下例所示:
  95. # ./test/image_page_12_0.jpg
  96. # 12代表当前图像在pdf中的第12页
  97. # 0代表当前图像为该页提取的第1张图像
  98. image_prefix = image_path.split('/')[-1]
  99. logger.info(f'processing img: {image_prefix}')
  100. page_number = image_prefix.split('_')[-2]
  101. response_item = {
  102. "qualtified": None, # 是否为证书
  103. "matched": None, # 是否出现匹配的公司名称
  104. "license_name": None, # 证书名
  105. "license_page": page_number, # 证书所在页
  106. "start_datetime": None, # 有效起始时间
  107. "end_datetime": None # 有效终止时间
  108. }
  109. content = self.get_content(image_path=image_path)
  110. image_info = content["rawjson"]["ret"]
  111. # 必须包含公司名称信息
  112. if not self.search(image_info=image_info, key=firm_name):
  113. return None
  114. else:
  115. response_item['matched'] = True
  116. # 是否匹配营业执照或资质证书
  117. for key, format in self.re_dict.items():
  118. if key == 'business_license':
  119. match_name = self.re_match(image_info=image_info, format=format)
  120. else:
  121. match_name = self.re_search(image_info=image_info, format=format)
  122. if match_name and key == 'business_license':
  123. response_item["qualtified"] = True
  124. response_item["license_name"] = match_name
  125. response_item = self.find_license_datetime(image_info=image_info, response_item=response_item)
  126. return response_item
  127. elif match_name:
  128. response_item["qualtified"] = True
  129. response_item["license_name"] = match_name
  130. response_item = self.find_certificate_datetime(image_info=image_info, response_item=response_item)
  131. return response_item
  132. return response_item
  133. # 资质证书有效期定位
  134. def find_certificate_datetime(self, image_info, response_item):
  135. # keyword
  136. start_keywords = ['颁发日期', '发证日期', '生效日期']
  137. end_keywords = ['终止日期']
  138. priority_keywords = ['有效期', '使用期限', '有效日期']
  139. keywords_list = ['有效期', '使用期限', '有效日期', '终止日期', '颁发日期', '发证日期', '生效日期']
  140. # re format
  141. format = r'(?:[自至])?\d{4}年\d{1,2}月\d{1,2}日(?:至)?(?:\d{4}年\d{1,2}月\d{1,2}日)?'
  142. special_format = r'\d{4}-\d{1,2}-\d{1,2}'
  143. # 判断是否存在日期关键字
  144. flag = False
  145. keyword_dict = {}
  146. for info in image_info:
  147. word = info['word']
  148. left = info['rect']['left']
  149. top = info['rect']['top']
  150. width = info['rect']['width']
  151. height = info['rect']['height']
  152. for keyword in keywords_list:
  153. # 该证书存在日期关键字
  154. if keyword in word:
  155. flag = True
  156. charset_list = info['charset']
  157. for char_dc in charset_list:
  158. if char_dc['word'] == keyword[-1]:
  159. right = char_dc['rect']['left'] + char_dc['rect']['width']
  160. keyword_dict[keyword] = {
  161. "left": left,
  162. "top": top,
  163. "right": right
  164. }
  165. if flag:
  166. for info in image_info:
  167. word = info['word']
  168. if '年' in word or re.search(r'\d', word):
  169. left = info['rect']['left']
  170. top = info['rect']['top']
  171. width = info['rect']['width']
  172. if '年' in word:
  173. find_list = re.findall(pattern=format, string=word)
  174. else:
  175. find_list = re.findall(pattern=special_format, string=word)
  176. # logger.info(f'word {word} has find_list{find_list}')
  177. # if self.check:
  178. # pdb.set_trace()
  179. if len(find_list) == 1:
  180. find_string = find_list[0]
  181. if '至' in find_string:
  182. start_prefix = find_string.split('至')[0].replace('自', '')
  183. end_prefix = find_string.split('至')[-1]
  184. if '年' in start_prefix:
  185. response_item['start_datetime'] = start_prefix
  186. if end_prefix != '':
  187. response_item['end_datetime'] = end_prefix
  188. return response_item
  189. # 不存在{至}的情况下通过位置和已有期限关键字来分配日期
  190. else:
  191. for k, k_info in keyword_dict.items():
  192. k_left = k_info['left']
  193. k_right = k_info['right']
  194. k_top = k_info['top']
  195. # 捕获关键字
  196. if left == k_left:
  197. if (k in priority_keywords) or (k in end_keywords) and response_item['end_datetime'] is None:
  198. response_item['end_datetime'] = find_string
  199. elif k in start_keywords and response_item['start_datetime'] is None:
  200. response_item['start_datetime'] = find_string
  201. break
  202. elif left >= k_right and top >= k_top:
  203. if (k in priority_keywords) or (k in end_keywords) and response_item['end_datetime'] is None:
  204. response_item['end_datetime'] = find_string
  205. elif k in start_keywords and response_item['start_datetime'] is None:
  206. response_item['start_datetime'] = find_string
  207. elif len(find_list) == 2:
  208. start_prefix = find_list[0].replace('自', '')
  209. end_prefix = find_list[-1].replace('至', '')
  210. if response_item['start_datetime'] is None:
  211. response_item['start_datetime'] = start_prefix
  212. if response_item['end_datetime'] is None:
  213. response_item['end_datetime'] = end_prefix
  214. else:
  215. logger.info(f'wrong word: {word} ...')
  216. else:
  217. continue
  218. return response_item
  219. # 营业执照有效期定位
  220. def find_license_datetime(self, image_info, response_item):
  221. for info in image_info:
  222. word = info['word']
  223. # id
  224. if (word.startswith('证照编号:') and len(word) == 25) or (word.isdigit() and len(word) == 20):
  225. response_item['id'] = word if word.isdigit() else word[5:]
  226. elif bool(re.match(self.datetime_re, word)):
  227. split = word.split('至')
  228. start_datetime = split[0]
  229. end_datetime = split[-1]
  230. response_item['start_datetime'] = start_datetime
  231. response_item['end_datetime'] = end_datetime
  232. elif word == '长期':
  233. response_item['start_datetime'] = response_item['end_datetime'] = '长期'
  234. return response_item
  235. # 在image_info中搜寻word中包含key的内容
  236. def search(self, image_info, key):
  237. for info in image_info:
  238. word = info['word']
  239. if key in word:
  240. return True
  241. return False
  242. # 在image_info中使用re.search搜寻满足{format}正则的信息
  243. def re_search(self, image_info, format):
  244. for info in image_info:
  245. word = info['word']
  246. match = re.search(format, word)
  247. if match:
  248. return match.group(0)
  249. return False
  250. # 在image_info中使用re.match搜寻满足{format}正则的信息
  251. def re_match(self, image_info, format):
  252. for info in image_info:
  253. word = info['word']
  254. match = re.match(format, word)
  255. if match:
  256. return word
  257. return False
  258. # 用于识别固定位置是否有公司法人签名
  259. def signature_recognition(self, image_path: str):
  260. keywords = ['投标函', '(法定代表人CA电子印章)','(法定代表人CA电子印章或签字)', '(签字)', '法定代表人或其委托代理人:', '法定代表人:']
  261. key_pos = {}
  262. image_prefix = image_path.split('/')[0]
  263. image_name = image_path.split('/')[-1][:-4]
  264. removed_image_name = image_name + '_roi' + image_path.split('/')[-1][-4:]
  265. ink_image_name = image_name + '_ink' + image_path.split('/')[-1][-4:]
  266. removed_image_path = os.path.join(image_prefix, removed_image_name)
  267. ink_image_path = os.path.join(image_prefix, ink_image_name)
  268. if not os.path.exists(removed_image_path):
  269. removed_seal_img = self.remove_red_seal(image_path=image_path)
  270. cv2.imwrite(removed_image_name, removed_seal_img)
  271. content = self.get_content(image_path=removed_image_path)
  272. image_info = content["rawjson"]["ret"]
  273. for info in image_info:
  274. word = info['word']
  275. left = info['rect']['left']
  276. top = info['rect']['top']
  277. width = info['rect']['width']
  278. height = info['rect']['height']
  279. right = left + width
  280. bottom = top + height
  281. for keyword in keywords:
  282. if keyword in word:
  283. key_pos[keyword] = {
  284. "word": word,
  285. "left": left,
  286. "right": right,
  287. "top": top,
  288. "bottom": bottom
  289. }
  290. break
  291. # 如果不存在"投标函"、"法定代表人"等关键字,则返回False
  292. if len(key_pos) == 0:
  293. return False
  294. # 定位到法定代表人所在位置
  295. if ((key_pos.get('法定代表人:') is not None) or (key_pos.get('法定代表人或其委托代理人:') is not None)) and \
  296. ((key_pos.get('(法定代表人CA电子印章)') is not None) or (key_pos.get('(法定代表人CA电子印章或签字)') is not None) or (key_pos.get('(签字)') is not None)):
  297. if key_pos.get('法定代表人或其委托代理人:') is not None:
  298. l_info = key_pos['法定代表人或其委托代理人:']
  299. l_cnt = 13
  300. l_string = '法定代表人或其委托代理人:'
  301. else:
  302. l_info = key_pos['法定代表人:']
  303. l_cnt = 6
  304. l_string = '法定代表人:'
  305. if key_pos.get('(法定代表人CA电子印章)') is not None:
  306. r_info = key_pos['(法定代表人CA电子印章)']
  307. r_string = '(法定代表人CA电子印章)'
  308. elif key_pos.get('(法定代表人CA电子印章或签字)') is not None:
  309. r_info = key_pos['(法定代表人CA电子印章或签字)']
  310. r_string = '(法定代表人CA电子印章或签字)'
  311. else:
  312. r_info = key_pos['(签字)']
  313. r_string = '(签字)'
  314. # 此时签名应在两者之间
  315. l = l_info['right']
  316. l_word = l_info['word']
  317. r = r_info['left']
  318. r_word = r_info['word']
  319. t = max(l_info['top'], r_info['top'])
  320. b = min(l_info['bottom'], r_info['bottom']) - 5
  321. if l_word[-l_cnt:] != l_string or r_word != r_string:
  322. return True
  323. else:
  324. black_ratio = self.ink_recognition(
  325. input_img=removed_seal_img,
  326. out_path=ink_image_path,
  327. meta={
  328. "left": l,
  329. "right": r,
  330. "top": t,
  331. "bottom": b
  332. }
  333. )
  334. if black_ratio >= self.sign_threshold:
  335. return True
  336. return False
  337. elif (key_pos.get('(法定代表人CA电子印章)') is not None) or (key_pos.get('(法定代表人CA电子印章或签字)') is not None) or (key_pos.get('(签字)') is not None):
  338. # 此时签名应已包含
  339. if key_pos.get('(法定代表人CA电子印章)') is not None:
  340. key = key_pos['(法定代表人CA电子印章)']
  341. elif key_pos.get('(法定代表人CA电子印章或签字)') is not None:
  342. key = key_pos['(法定代表人CA电子印章或签字)']
  343. elif key_pos.get('(签字)') is not None:
  344. key = key_pos['(签字)']
  345. key_word = key['word']
  346. key_word = key_word.replace('(法定代表人CA电子印章)','').replace('(法定代表人CA电子印章或签字)', '').replace('(签字)','').replace('法定代表人或其委托代理人:', '').replace('法定代表人:', '')
  347. if key_word != '':
  348. return True
  349. return False
  350. elif key_pos.get('法定代表人:') is not None:
  351. # 此时签名在右边或已包含
  352. word = key_pos['法定代表人:']['word']
  353. l = key_pos['法定代表人:']['left']
  354. r = l + 100
  355. t = key_pos['法定代表人:']['top']
  356. b = key_pos['法定代表人:']['bottom'] - 5
  357. if word[-6:] != '法定代表人:':
  358. return True
  359. else:
  360. black_ratio = self.ink_recognition(
  361. input_img=removed_seal_img,
  362. out_path=ink_image_path,
  363. meta={
  364. "left": l,
  365. "right": r,
  366. "top": t,
  367. "bottom": b
  368. }
  369. )
  370. if black_ratio >= self.sign_threshold:
  371. return True
  372. return False
  373. elif key_pos.get('法定代表人或其委托代理人:') is not None:
  374. # 此时签名在右边或已包含
  375. word = key_pos['法定代表人或其委托代理人:']['word']
  376. l = key_pos['法定代表人或其委托代理人:']['left']
  377. r = l + 100
  378. t = key_pos['法定代表人或其委托代理人:']['top']
  379. b = key_pos['法定代表人或其委托代理人:']['bottom'] - 5
  380. if word[-13:] != '法定代表人或其委托代理人:':
  381. return True
  382. else:
  383. black_ratio = self.ink_recognition(
  384. input_img=removed_seal_img,
  385. meta={
  386. "left": l,
  387. "right": r,
  388. "top": t,
  389. "bottom": b
  390. }
  391. )
  392. if black_ratio >= self.sign_threshold:
  393. return True
  394. return False
  395. else:
  396. return False
  397. # 用于判断固定位置的长方形框内是否存在签名字迹
  398. # 用于识别图像固定位置黑色字迹所占比例,并将该位置的图像截取保存
  399. def ink_recognition(self, input_img, out_path, meta: dict):
  400. left = meta["left"]
  401. right = meta["right"]
  402. top = meta["top"]
  403. bottom = meta["bottom"]
  404. crop_img = input_img[top:bottom, left:right, :]
  405. cv2.imwrite(out_path, crop_img)
  406. gray_img = cv2.cvtColor(crop_img, cv2.COLOR_BGR2GRAY)
  407. thresh, ret = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)
  408. filter_condition = int(thresh * 0.90)
  409. _, black_thresh = cv2.threshold(gray_img, filter_condition, 255, cv2.THRESH_BINARY_INV)
  410. total_pixels = black_thresh.size
  411. black_pixels = np.count_nonzero(black_thresh)
  412. black_ratio = black_pixels / total_pixels
  413. return black_ratio
  414. # 提供pdf解析,并基于提取文本信息进行位置匹配
  415. class PdfMatcher(PdfExtractAttr):
  416. # file_path为提供的pdf文件路径
  417. def __init__(self, file_path: str):
  418. super(PdfMatcher, self).__init__(
  419. file_path=file_path
  420. )
  421. # 投标书名称
  422. self.bid_name = file_path.split('/')[-1][:-4]
  423. # 投标书数据文件夹
  424. self.bid_dir = os.path.join(os.path.dirname(file_path), self.bid_name)
  425. # 公司名称
  426. self.firm_name = file_path.split('/')[-2]
  427. # title list
  428. title_path = os.path.join(self.bid_dir, "title.json")
  429. # image list
  430. self.image_dir = os.path.join(self.bid_dir, "extracted_images")
  431. if (not os.path.exists(title_path)) or (not os.path.exists(self.image_dir)):
  432. os.makedirs(self.image_dir, exist_ok=True)
  433. self.main_parse(pdf_path=file_path, title_path=title_path, image_dir=self.image_dir)
  434. self.title = load_json(title_path)
  435. # outline list
  436. outline_path = os.path.join(self.bid_dir, "outlines.json")
  437. self.outline = self.parse_outline(out_path=outline_path)
  438. # text list
  439. text_path = os.path.join(self.bid_dir, "all_texts.json")
  440. self.details = self.parse_text(out_path=text_path)
  441. # table list
  442. table_path = os.path.join(self.bid_dir, "all_tables.json")
  443. if os.path.exists(table_path):
  444. self.table = load_json(table_path)
  445. else:
  446. self.tables = self.parse_table(out_path=table_path)
  447. # image format
  448. self.image_format = "image_page_{}*"
  449. # image filter threshold
  450. self.start_threshold = 10
  451. self.distance_threshold = 6
  452. self.search_threshold = 20
  453. # 用于定位营业执照、资质证书的页面范围
  454. def search_interval(self):
  455. '''定位营业执照、资质证书的区间范围'''
  456. # 通过关键字模糊定位
  457. keywords = ['资格审查资料','资格审查材料','其它材料','其他材料','其他资料','附件', '影印件']
  458. search_interval = []
  459. # locate in title.json
  460. left_pos = -1 # 左指针
  461. right_pos = -1 # 右指针
  462. for title_block in self.title:
  463. block_text = title_block['text'].replace(' ', '').strip()
  464. # 先进行左区间判定
  465. if left_pos != -1 and '证书' not in block_text:
  466. right_pos = title_block['page_number']
  467. search_interval.append((left_pos, right_pos))
  468. # 重置
  469. left_pos = -1
  470. for keyword in keywords:
  471. if keyword in block_text:
  472. # 先进行模糊的outline定位
  473. center_page = None
  474. if '.' in block_text:
  475. center_page = block_text.split('.')[-1]
  476. if center_page.isdigit():
  477. center_page = eval(center_page)
  478. left_pos = min(title_block['page_number'], center_page)
  479. else:
  480. left_pos = title_block['page_number']
  481. # 最终判定
  482. if left_pos != -1:
  483. search_interval.append((left_pos, right_pos))
  484. # 重置
  485. left_pos = -1
  486. right_pos = -1
  487. # locate in outlines.json
  488. if len(self.outline) > 0:
  489. for outline_block in self.outline:
  490. if left_pos != -1:
  491. right_pos = outline_block["page_number"]
  492. right_pos = right_pos if right_pos is not None else -1
  493. search_interval.append((left_pos, right_pos))
  494. left_pos = -1
  495. outline_text = outline_block['title'].strip()
  496. for keyword in keywords:
  497. if keyword in outline_text:
  498. if outline_block["page_number"] is not None:
  499. left_pos = outline_block["page_number"]
  500. # 最终判定
  501. if left_pos != -1:
  502. search_interval.append((left_pos, right_pos))
  503. # 搜寻区间合并
  504. search_interval.sort()
  505. merge_interval = []
  506. if len(search_interval) > 0:
  507. left = -1
  508. right = -1
  509. for interval in search_interval:
  510. l, r = interval
  511. if r < l:
  512. continue
  513. if left == -1 and right == -1:
  514. left = l
  515. right = r
  516. elif l <= right:
  517. right = r
  518. else:
  519. merge_interval.append((left, right))
  520. left = l
  521. right = r
  522. merge_interval.append((left, right))
  523. return merge_interval
  524. # 返回可能为营业执照或资质证书的图像集
  525. def find_candidate_images(self):
  526. candidate_images = set()
  527. merge_intervals = self.search_interval()
  528. for interval in merge_intervals:
  529. start_page, end_page = interval
  530. if start_page <= self.start_threshold:
  531. continue
  532. if end_page == -1:
  533. end_page = start_page + 20
  534. candidate_images = self.image_regularization(start_page=max(0, start_page-self.search_threshold), end_page=end_page+self.search_threshold, candidate_images=candidate_images)
  535. candidate_images = list(candidate_images)
  536. return candidate_images
  537. # 使用正则查询符合格式的图像
  538. def image_regularization(self, start_page: int, end_page:int, candidate_images: set):
  539. for index in range(start_page, end_page + 1):
  540. current_format = self.image_format.format(index)
  541. files = glob.glob(os.path.join(self.image_dir, current_format))
  542. filter_files = [file for file in files if not file.endswith('.unk')]
  543. candidate_images.update(filter_files)
  544. return candidate_images
  545. class PdfParse_pipeline():
  546. def __init__(self,
  547. ocr, # ocr接口
  548. firm_dir, # 存储所有公司的路径
  549. out_path # 输出地址
  550. ):
  551. self.ocr = ocr
  552. self.firm_dir = firm_dir
  553. self.out_path = out_path
  554. def parse_pipeline(self):
  555. data = {}
  556. for firm_name in tqdm(os.listdir(self.firm_dir)):
  557. logger.info(f'processing firm {firm_name} ...')
  558. firm_path = os.path.join(self.firm_dir, firm_name)
  559. for bid_name in tqdm(os.listdir(firm_path)):
  560. if bid_name.endswith('.pdf'):
  561. document=os.path.join(firm_path, bid_name)
  562. bid_dir = os.path.join(firm_path, bid_name[:-4])
  563. os.makedirs(bid_dir, exist_ok=True)
  564. document_data = self.parse_single_document(pdf_path=document)
  565. data[firm_name] = document_data
  566. # 以下将data的数据存入out_path
  567. with open(self.out_path, 'w', encoding='utf-8') as f:
  568. json.dump(data, f, ensure_ascii=False, indent=4)
  569. return data
  570. def parse_single_document(self, pdf_path: str):
  571. agent = PdfMatcher(file_path=pdf_path)
  572. firm_name = agent.firm_name
  573. data = {
  574. "license_list":[]
  575. }
  576. candidate_images = agent.find_candidate_images()
  577. if len(candidate_images) == 0:
  578. pass
  579. else:
  580. for img in candidate_images:
  581. try:
  582. response = ocr.judge(image_path=img, firm_name=firm_name)
  583. if response == None or response['qualtified'] == None:
  584. continue
  585. else:
  586. data["license_list"].append({
  587. "license_name": response["license_name"],
  588. "license_path": img,
  589. "license_page": response["license_page"],
  590. "start_datetime": response["start_datetime"],
  591. "end_datetime": response["end_datetime"]
  592. })
  593. except ValueError as e:
  594. print(e)
  595. return data
  596. if __name__ == "__main__":
  597. # [测试demo]
  598. start_time = time.time()
  599. # 请针对自己的环境进行修改log_path
  600. global logger
  601. log_path = "/home/stf/miner_pdf/interface/test_logs/info.log"
  602. logger = create_logger(log_path=log_path)
  603. # [环境参数]
  604. # ocr url
  605. url = "http://120.48.103.13:18000/ctr_ocr"
  606. # seal_ocr url
  607. base_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/seal?access_token="
  608. # seal_ocr access_token
  609. access_token = "24.6bbe9987c6bd19ba65e4402917811657.2592000.1724573148.282335-86574608"
  610. # seal_ocr headers
  611. headers = {'content-type': 'application/x-www-form-urlencoded'}
  612. # data_path为存储所有投标公司的起始路径
  613. data_path = "/home/stf/miner_pdf/data/投标公司pdf"
  614. # test_data_path为存储测试投标公司的起始路径
  615. test_data_path = "/home/stf/miner_pdf/interface/test_files"
  616. # pipeline_out_path为执行所有公司pipeline逻辑后的输出位置
  617. # 其为存放营业执照和资质证书位置信息的json文件
  618. pipeline_out_path = "/home/stf/miner_pdf/interface/outdir/test_pipeline.json"
  619. # single_out_path为执行单个公司pdf解析逻辑后的输出位置
  620. # 其为存放营业执照和资质证书位置信息的json文件
  621. single_out_path = "/home/stf/miner_pdf/interface/outdir/test_single.json"
  622. # ground_truth目前为存储所有非扫描公司在pdf中营业执照与资质证书的json文件
  623. ground_truth = "/home/stf/miner_pdf/ground_truth.json"
  624. # 用于区分该公司提供的pdf文件为(扫描件 or 非扫描件)
  625. firm_excel_file = "/home/stf/miner_pdf/data/certificate.xlsx"
  626. df = pd.read_excel(firm_excel_file)
  627. # 封装好的ocr接口
  628. ocr = OcrAgent(url=url)
  629. # 封装好的pipeline
  630. pipeline = PdfParse_pipeline(
  631. ocr=ocr,
  632. firm_dir=test_data_path,
  633. out_path=single_out_path
  634. )
  635. # start
  636. data = pipeline.parse_pipeline()
  637. # caculate time cost
  638. cost_time = time.time() - start_time
  639. logger.info(f"total cost {cost_time // 60} min {cost_time % 60} sec ...")