# optimize_miner.py
  1. # 在pdf_miner的基础上进行优化
  2. # 标准包导入
  3. import os
  4. import re
  5. import json
  6. import re
  7. import shutil
  8. import pandas as pd
  9. import pdb
  10. import base64
  11. from io import BytesIO
  12. from pprint import pprint
  13. from paddleocr import PPStructure, draw_structure_result, save_structure_res
  14. from pypdf import PdfReader
  15. from pdf2image import convert_from_path
  16. # 第三方包导入
  17. import numpy as np
  18. import pandas as pd
  19. import cv2
  20. import torch
  21. import glob
  22. import logging
  23. import requests
  24. import time
  25. import datetime
  26. import subprocess
  27. from tqdm import tqdm
  28. from tooklit import RefPageNumberResolver
  29. from get_info import PdfExtractAttr
  30. from get_info import is_title, export_image, _save_jpeg, _save_jpeg2000, _save_bmp, main_parse, table_parse, load_json
  31. from PIL import Image
  32. from pdfminer.image import ImageWriter
  33. from tooklit import remove_red_seal, remove_blue_seal
  34. # tools function
  35. def create_logger(log_path):
  36. """
  37. 将日志输出到日志文件和控制台
  38. """
  39. logger = logging.getLogger()
  40. logger.setLevel(logging.INFO)
  41. formatter = logging.Formatter(
  42. '%(asctime)s - %(levelname)s - %(message)s')
  43. # 创建一个handler,用于写入日志文件
  44. file_handler = logging.FileHandler(
  45. filename=log_path, mode='w')
  46. file_handler.setFormatter(formatter)
  47. file_handler.setLevel(logging.INFO)
  48. logger.addHandler(file_handler)
  49. # 创建一个handler,用于将日志输出到控制台
  50. console = logging.StreamHandler()
  51. console.setLevel(logging.DEBUG)
  52. console.setFormatter(formatter)
  53. logger.addHandler(console)
  54. return logger
  55. # 页面信息缓存
  56. class PageBuffer():
  57. def __init__(self):
  58. self.page_cache = {}
  59. # 查询某一页的信息属性
  60. def query(self, page):
  61. if self.page_cache.get(page, -1) == -1:
  62. return None
  63. page_info = self.page_cache[page]
  64. return page_info
  65. class SealAgent():
  66. def __init__(self, url, headers):
  67. self.url = url
  68. self.headers = headers
  69. def get_content(self, image_path):
  70. f = open(image_path, 'rb')
  71. img = base64.b64encode(f.read())
  72. params = {"image":img}
  73. try:
  74. response = requests.post(url=self.url, data=params, headers=self.headers)
  75. return response.json()
  76. except:
  77. logger.info(f"当前图像:{image_path}在印章识别ocr接口中网络不稳定 ...")
  78. def seal_parse(self, image_path):
  79. meta = {
  80. "firm_seals": [],
  81. "indiv_seals": []
  82. }
  83. content = self.get_content(image_path=image_path)
  84. seal_num = content["result_num"]
  85. seal_result = content["result"]
  86. if seal_num == 0:
  87. return meta
  88. for seal_info in seal_result:
  89. seal_type = seal_info["type"]
  90. seal_content = seal_info["major"]["words"].strip().replace(' ', '')
  91. top = seal_info["location"]["top"]
  92. left = seal_info["location"]["left"]
  93. width = seal_info["location"]["width"]
  94. height = seal_info["location"]["height"]
  95. if '公司' in seal_content:
  96. meta['firm_seals'].append(
  97. {
  98. "seal_type": seal_type,
  99. "firm_name": seal_content
  100. }
  101. )
  102. else:
  103. meta['indiv_seals'].append({
  104. "seal_type": seal_type,
  105. "indiv_name": seal_content
  106. })
  107. return meta
  108. # ocr外部接口
  109. class OcrAgent():
  110. def __init__(self, url):
  111. self.url = url
  112. self.datetime_re = r'\d{4}年\d{1,2}月\d{1,2}日至(?:\d{4}年\d{1,2}月\d{1,2}日|长期)'
  113. # 不同类型证书资质正则
  114. self.re_dict = {
  115. "business_license" : r'营业执照',
  116. "deposit": r'^(?:开户许可证|[\u4e00-\u9fff]+存款账户[\u4e00-\u9fff]+)$',
  117. "production_license": r'\b[\u4e00-\u9fff]*许可证\b',
  118. "qualtifications" : r'\b[\u4e00-\u9fff]*证书',
  119. "proof": r'\b[\u4e00-\u9fff]*证明',
  120. }
  121. # 字迹阈值
  122. self.sign_threshold = 0.05
  123. self.font_threshold = 39
  124. # 集成印章ocr
  125. def integrate_sealagent(self, url, headers):
  126. self.sealagent = SealAgent(url=url, headers=headers)
  127. # 获取图像的ocr信息
  128. def get_content(self, image_path):
  129. try:
  130. with open(image_path, 'rb') as image_file:
  131. files = {"file": ("image.jpg", image_file, "image/jpeg")}
  132. response = requests.post(self.url, files=files)
  133. return response.json()
  134. except:
  135. raise ValueError(f"传入图像{image_path}已损坏")
  136. def judge_pro(self, image_path: str, firm_name: str):
  137. # 以下实现要求image_path的路径如下例所示:
  138. # ./test/page-0.jpg
  139. image_name = image_path.split('/')[-1]
  140. logger.info(f'processing img: {image_name}')
  141. page_number = image_name.split('-')[-1].split('.')[0]
  142. response_item = {
  143. "qualtified": None, # 是否为证书
  144. "matched": None, # 是否出现匹配的公司名称
  145. "license_name": None, # 证书名
  146. "license_page": page_number, # 证书所在页
  147. "start_datetime": None, # 有效起始时间
  148. "end_datetime": None # 有效终止时间
  149. }
  150. content = self.get_content(image_path=image_path)
  151. image_info = content["rawjson"]["ret"]
  152. # 必须包含公司名称信息
  153. if not self.search(image_info=image_info, key_list=[firm_name]):
  154. return None
  155. else:
  156. response_item['matched'] = True
  157. # 是否匹配营业执照或资质证书
  158. for key, format in self.re_dict.items():
  159. if key == 'business_license':
  160. match_name = self.re_match(image_info=image_info, format=format)
  161. else:
  162. match_name = self.re_search(image_info=image_info, format=format)
  163. if match_name and key == 'business_license':
  164. response_item["qualtified"] = True
  165. response_item["license_name"] = match_name
  166. response_item = self.find_license_datetime(image_info=image_info, response_item=response_item)
  167. return response_item
  168. elif match_name:
  169. response_item["qualtified"] = True
  170. response_item["license_name"] = match_name
  171. response_item = self.find_certificate_datetime(image_info=image_info, response_item=response_item)
  172. return response_item
  173. return response_item
  174. # 判断图像是否为某公司的营业执照或资质证书信息,并返回提取到的信息
  175. def judge(self, image_path: str, firm_name: str):
  176. # 以下实现要求image_path的路径如下例所示:
  177. # ./test/image_page_12_0.jpg
  178. # 12代表当前图像在pdf中的第12页
  179. # 0代表当前图像为该页提取的第1张图像
  180. image_prefix = image_path.split('/')[-1]
  181. logger.info(f'processing img: {image_prefix}')
  182. page_number = image_prefix.split('_')[-2]
  183. response_item = {
  184. "qualtified": None, # 是否为证书
  185. "matched": None, # 是否出现匹配的公司名称
  186. "license_name": None, # 证书名
  187. "license_page": page_number, # 证书所在页
  188. "start_datetime": None, # 有效起始时间
  189. "end_datetime": None # 有效终止时间
  190. }
  191. content = self.get_content(image_path=image_path)
  192. image_info = content["rawjson"]["ret"]
  193. # 必须包含公司名称信息
  194. if not self.search(image_info=image_info, key=firm_name):
  195. return None
  196. else:
  197. response_item['matched'] = True
  198. # 是否匹配营业执照或资质证书
  199. for key, format in self.re_dict.items():
  200. if key == 'business_license':
  201. match_name = self.re_match(image_info=image_info, format=format)
  202. else:
  203. match_name = self.re_search(image_info=image_info, format=format)
  204. if match_name and key == 'business_license':
  205. response_item["qualtified"] = True
  206. response_item["license_name"] = match_name
  207. response_item = self.find_license_datetime(image_info=image_info, response_item=response_item)
  208. return response_item
  209. elif match_name:
  210. response_item["qualtified"] = True
  211. response_item["license_name"] = match_name
  212. response_item = self.find_certificate_datetime(image_info=image_info, response_item=response_item)
  213. return response_item
  214. return response_item
  215. # 资质证书有效期定位
  216. def find_certificate_datetime(self, image_info, response_item):
  217. # keyword
  218. start_keywords = ['颁发日期', '发证日期', '生效日期']
  219. end_keywords = ['终止日期']
  220. priority_keywords = ['有效期', '使用期限', '有效日期']
  221. keywords_list = ['有效期', '使用期限', '有效日期', '终止日期', '颁发日期', '发证日期', '生效日期']
  222. # re format
  223. format = r'(?:[自至])?\d{4}年\d{1,2}月\d{1,2}日(?:至)?(?:\d{4}年\d{1,2}月\d{1,2}日)?'
  224. special_format = r'\d{4}-\d{1,2}-\d{1,2}'
  225. # 判断是否存在日期关键字
  226. flag = False
  227. keyword_dict = {}
  228. for info in image_info:
  229. word = info['word']
  230. left = info['rect']['left']
  231. top = info['rect']['top']
  232. width = info['rect']['width']
  233. height = info['rect']['height']
  234. for keyword in keywords_list:
  235. # 该证书存在日期关键字
  236. if keyword in word:
  237. flag = True
  238. charset_list = info['charset']
  239. for char_dc in charset_list:
  240. if char_dc['word'] == keyword[-1]:
  241. right = char_dc['rect']['left'] + char_dc['rect']['width']
  242. keyword_dict[keyword] = {
  243. "left": left,
  244. "top": top,
  245. "right": right
  246. }
  247. if flag:
  248. for info in image_info:
  249. word = info['word']
  250. if '年' in word or re.search(r'\d', word):
  251. left = info['rect']['left']
  252. top = info['rect']['top']
  253. width = info['rect']['width']
  254. if '年' in word:
  255. find_list = re.findall(pattern=format, string=word)
  256. else:
  257. find_list = re.findall(pattern=special_format, string=word)
  258. # logger.info(f'word {word} has find_list{find_list}')
  259. # if self.check:
  260. # pdb.set_trace()
  261. if len(find_list) == 1:
  262. find_string = find_list[0]
  263. if '至' in find_string:
  264. start_prefix = find_string.split('至')[0].replace('自', '')
  265. end_prefix = find_string.split('至')[-1]
  266. if '年' in start_prefix:
  267. response_item['start_datetime'] = start_prefix
  268. if end_prefix != '':
  269. response_item['end_datetime'] = end_prefix
  270. return response_item
  271. # 不存在{至}的情况下通过位置和已有期限关键字来分配日期
  272. else:
  273. for k, k_info in keyword_dict.items():
  274. k_left = k_info['left']
  275. k_right = k_info['right']
  276. k_top = k_info['top']
  277. # 捕获关键字
  278. if left == k_left:
  279. if (k in priority_keywords) or (k in end_keywords) and response_item['end_datetime'] is None:
  280. response_item['end_datetime'] = find_string
  281. elif k in start_keywords and response_item['start_datetime'] is None:
  282. response_item['start_datetime'] = find_string
  283. break
  284. elif left >= k_right and top >= k_top:
  285. if (k in priority_keywords) or (k in end_keywords) and response_item['end_datetime'] is None:
  286. response_item['end_datetime'] = find_string
  287. elif k in start_keywords and response_item['start_datetime'] is None:
  288. response_item['start_datetime'] = find_string
  289. elif len(find_list) == 2:
  290. start_prefix = find_list[0].replace('自', '')
  291. end_prefix = find_list[-1].replace('至', '')
  292. if response_item['start_datetime'] is None:
  293. response_item['start_datetime'] = start_prefix
  294. if response_item['end_datetime'] is None:
  295. response_item['end_datetime'] = end_prefix
  296. else:
  297. logger.info(f'wrong word: {word} ...')
  298. else:
  299. continue
  300. return response_item
  301. # 营业执照有效期定位
  302. def find_license_datetime(self, image_info, response_item):
  303. for info in image_info:
  304. word = info['word']
  305. # id
  306. if (word.startswith('证照编号:') and len(word) == 25) or (word.isdigit() and len(word) == 20):
  307. response_item['id'] = word if word.isdigit() else word[5:]
  308. elif bool(re.match(self.datetime_re, word)):
  309. split = word.split('至')
  310. start_datetime = split[0]
  311. end_datetime = split[-1]
  312. response_item['start_datetime'] = start_datetime
  313. response_item['end_datetime'] = end_datetime
  314. elif word == '长期':
  315. response_item['start_datetime'] = response_item['end_datetime'] = '长期'
  316. return response_item
  317. # 在目录中找到正文pos右侧对应的数字标签
  318. def digit_label(self, image_info, pos: dict):
  319. gold_left = pos['left']
  320. gold_right = pos['right']
  321. gold_top = pos['top']
  322. gold_bottom = pos['bottom']
  323. # 判断字符串中是否包含数字
  324. def contain_digit(word):
  325. for c in word:
  326. if c.isdigit():
  327. return True
  328. return False
  329. mini_distance = 10000
  330. mini_word = ""
  331. for info in image_info:
  332. word = info['word']
  333. left = info['rect']['left']
  334. top = info['rect']['top']
  335. width = info['rect']['width']
  336. height = info['rect']['height']
  337. right = left + width
  338. bottom = top + height
  339. if contain_digit(word=word) and left >= gold_left:
  340. distance = abs(top - gold_top)
  341. if distance < mini_distance:
  342. mini_distance = distance
  343. mini_word = word
  344. # 提取最终的mini_word
  345. label_page = None
  346. if '.' in mini_word:
  347. label_page = mini_word.split('.')[-1]
  348. elif mini_word.isdigit():
  349. label_page = mini_word
  350. return label_page
  351. # 在image_info中搜寻word中包含key_list的内容,并打包信息返回
  352. def pack_search(self, image_info, key_list):
  353. meta = []
  354. for info in image_info:
  355. word = info['word'].strip().replace(' ', '')
  356. left = info['rect']['left']
  357. top = info['rect']['top']
  358. width = info['rect']['width']
  359. height = info['rect']['height']
  360. right = left + width
  361. bottom = top + height
  362. for key in key_list:
  363. if key in word:
  364. meta.append({
  365. "word": word,
  366. "contain_key": key,
  367. "bbox": {
  368. "left": left,
  369. "right": right,
  370. "top": top,
  371. "bottom": bottom,
  372. "width": width,
  373. "height": height
  374. }
  375. })
  376. return meta
  377. # 在image_info中搜寻word中包含key_list的内容
  378. def search(self, image_info, key_list):
  379. for info in image_info:
  380. word = info['word'].strip().replace(' ', '')
  381. for key in key_list:
  382. if key in word:
  383. return True
  384. return False
  385. # 精确匹配key_list中的内容
  386. def exact_search(self, image_info, key_list):
  387. meta = []
  388. for info in image_info:
  389. word = info['word'].strip().replace(' ', '')
  390. for key in key_list:
  391. if key == word:
  392. height = info['rect']['height']
  393. meta.append({
  394. "keyword": word,
  395. "font_size": height
  396. })
  397. return meta
  398. # 在image_info中使用re.search搜寻满足{format}正则的信息
  399. def re_search(self, image_info, format):
  400. for info in image_info:
  401. word = info['word']
  402. match = re.search(format, word)
  403. if match:
  404. return match.group(0)
  405. return False
  406. # 在image_info中使用re.match搜寻满足{format}正则的信息
  407. def re_match(self, image_info, format):
  408. for info in image_info:
  409. word = info['word']
  410. match = re.match(format, word)
  411. if match:
  412. return word
  413. return False
  414. # 用于识别固定位置是否有公司法人签名或公司盖章
  415. def signature_recognition(self, image_path: str):
  416. # 先调用接口判断公司盖章
  417. meta = self.sealagent.seal_parse(image_path=image_path)
  418. if len(meta["firm_seals"]) > 0 or len(meta["indiv_seals"]) > 0:
  419. logger.info("检测到当前页面具备印章 ...")
  420. return True
  421. keywords = ['投标函', '(法定代表人CA电子印章)','(法定代表人CA电子印章或签字)', '(签字)', '法定代表人或其委托代理人:', '法定代表人:']
  422. key_pos = {}
  423. image_prefix = os.path.dirname(image_path)
  424. image_name = image_path.split('/')[-1][:-4]
  425. removed_red_image_name = image_name + '_red_roi' + image_path.split('/')[-1][-4:]
  426. removed_blue_image_name = image_name + '_blue_roi' + image_path.split('/')[-1][-4:]
  427. red_ink_image_name = image_name + '_red_ink' + image_path.split('/')[-1][-4:]
  428. blue_ink_image_name = image_name + '_blue_ink' + image_path.split('/')[-1][-4:]
  429. removed_red_image_path = os.path.join(image_prefix, removed_red_image_name)
  430. removed_blue_image_path = os.path.join(image_prefix, removed_blue_image_name)
  431. red_ink_image_path = os.path.join(image_prefix, red_ink_image_name)
  432. blue_ink_image_path = os.path.join(image_prefix, blue_ink_image_name)
  433. if not os.path.exists(removed_red_image_path):
  434. removed_red_seal_img = remove_red_seal(image_path=image_path)
  435. cv2.imwrite(removed_red_image_path, removed_red_seal_img)
  436. else:
  437. removed_red_seal_img = cv2.imread(removed_red_image_path)
  438. if not os.path.exists(removed_blue_image_path):
  439. removed_blue_seal_img = remove_blue_seal(image_path=image_path)
  440. cv2.imwrite(removed_blue_image_path, removed_blue_seal_img)
  441. else:
  442. removed_blue_seal_img = cv2.imread(removed_blue_image_path)
  443. red_content = self.get_content(image_path=removed_red_image_path)
  444. red_image_info = red_content["rawjson"]["ret"]
  445. blue_content = self.get_content(image_path=removed_blue_image_path)
  446. blue_image_info = blue_content["rawjson"]["ret"]
  447. def identify(image_info, input_img, out_path):
  448. for info in image_info:
  449. word = info['word'].replace(' ', '')
  450. left = info['rect']['left']
  451. top = info['rect']['top']
  452. width = info['rect']['width']
  453. height = info['rect']['height']
  454. right = left + width
  455. bottom = top + height
  456. for keyword in keywords:
  457. if keyword in word:
  458. key_pos[keyword] = {
  459. "word": word,
  460. "left": left,
  461. "right": right,
  462. "top": top,
  463. "bottom": bottom
  464. }
  465. break
  466. # 如果不存在"投标函"、"法定代表人"等关键字,则返回False
  467. if len(key_pos) == 0:
  468. return False
  469. # 定位到法定代表人所在位置
  470. # import pdb; pdb.set_trace()
  471. if ((key_pos.get('法定代表人:') is not None) or (key_pos.get('法定代表人或其委托代理人:') is not None)) and \
  472. ((key_pos.get('(法定代表人CA电子印章)') is not None) or (key_pos.get('(法定代表人CA电子印章或签字)') is not None) or (key_pos.get('(签字)') is not None)):
  473. if key_pos.get('法定代表人或其委托代理人:') is not None:
  474. l_info = key_pos['法定代表人或其委托代理人:']
  475. l_cnt = 13
  476. l_string = '法定代表人或其委托代理人:'
  477. else:
  478. l_info = key_pos['法定代表人:']
  479. l_cnt = 6
  480. l_string = '法定代表人:'
  481. if key_pos.get('(法定代表人CA电子印章)') is not None:
  482. r_info = key_pos['(法定代表人CA电子印章)']
  483. r_string = '(法定代表人CA电子印章)'
  484. elif key_pos.get('(法定代表人CA电子印章或签字)') is not None:
  485. r_info = key_pos['(法定代表人CA电子印章或签字)']
  486. r_string = '(法定代表人CA电子印章或签字)'
  487. else:
  488. r_info = key_pos['(签字)']
  489. r_string = '(签字)'
  490. # 此时签名应在两者之间
  491. l = l_info['right']
  492. l_word = l_info['word']
  493. r = r_info['left']
  494. r_word = r_info['word']
  495. t = max(l_info['top'], r_info['top'])
  496. b = min(l_info['bottom'], r_info['bottom']) - 5
  497. if l_word[-l_cnt:] != l_string or r_word != r_string:
  498. return True
  499. else:
  500. black_ratio = self.ink_recognition(
  501. input_img=input_img,
  502. out_path=out_path,
  503. meta={
  504. "left": l,
  505. "right": r,
  506. "top": t,
  507. "bottom": b
  508. }
  509. )
  510. if black_ratio >= self.sign_threshold:
  511. return True
  512. return False
  513. elif (key_pos.get('(法定代表人CA电子印章)') is not None) or (key_pos.get('(法定代表人CA电子印章或签字)') is not None) or (key_pos.get('(签字)') is not None):
  514. # 此时签名应已包含
  515. if key_pos.get('(法定代表人CA电子印章)') is not None:
  516. key = key_pos['(法定代表人CA电子印章)']
  517. elif key_pos.get('(法定代表人CA电子印章或签字)') is not None:
  518. key = key_pos['(法定代表人CA电子印章或签字)']
  519. elif key_pos.get('(签字)') is not None:
  520. key = key_pos['(签字)']
  521. key_word = key['word']
  522. key_word = key_word.replace('(法定代表人CA电子印章)','').replace('(法定代表人CA电子印章或签字)', '').replace('(签字)','').replace('法定代表人或其委托代理人:', '').replace('法定代表人:', '')
  523. if key_word != '':
  524. return True
  525. return False
  526. elif key_pos.get('法定代表人:') is not None:
  527. # 此时签名在右边或已包含
  528. word = key_pos['法定代表人:']['word']
  529. l = key_pos['法定代表人:']['left']
  530. r = l + 100
  531. t = key_pos['法定代表人:']['top']
  532. b = key_pos['法定代表人:']['bottom'] - 5
  533. if word[-6:] != '法定代表人:':
  534. return True
  535. else:
  536. black_ratio = self.ink_recognition(
  537. input_img=input_img,
  538. out_path=out_path,
  539. meta={
  540. "left": l,
  541. "right": r,
  542. "top": t,
  543. "bottom": b
  544. }
  545. )
  546. if black_ratio >= self.sign_threshold:
  547. return True
  548. return False
  549. elif key_pos.get('法定代表人或其委托代理人:') is not None:
  550. # 此时签名在右边或已包含
  551. word = key_pos['法定代表人或其委托代理人:']['word']
  552. l = key_pos['法定代表人或其委托代理人:']['left']
  553. r = l + 100
  554. t = key_pos['法定代表人或其委托代理人:']['top']
  555. b = key_pos['法定代表人或其委托代理人:']['bottom'] - 5
  556. if word[-13:] != '法定代表人或其委托代理人:':
  557. return True
  558. else:
  559. black_ratio = self.ink_recognition(
  560. input_img=input_img,
  561. out_path=out_path,
  562. meta={
  563. "left": l,
  564. "right": r,
  565. "top": t,
  566. "bottom": b
  567. }
  568. )
  569. if black_ratio >= self.sign_threshold:
  570. return True
  571. return False
  572. else:
  573. return False
  574. return identify(red_image_info, removed_red_seal_img, red_ink_image_path) \
  575. or identify(blue_image_info, removed_blue_seal_img, blue_ink_image_path)
  576. # 用于判断固定位置的长方形框内是否存在签名字迹
  577. # 用于识别图像固定位置黑色字迹所占比例,并将该位置的图像截取保存
  578. def ink_recognition(self, input_img, out_path, meta: dict):
  579. left = meta["left"]
  580. right = meta["right"]
  581. top = meta["top"]
  582. bottom = meta["bottom"]
  583. crop_img = input_img[top:bottom, left:right, :]
  584. cv2.rectangle(input_img, (left, top), (right, bottom), (255, 255, 0), 2) # 绿色框,线宽为2
  585. test_path = out_path[:-4] + '*' + out_path[-4:]
  586. if crop_img is None or crop_img.size == 0:
  587. logger.info("Error: crop_img is empty")
  588. return 0.0
  589. else:
  590. cv2.imwrite(out_path, crop_img)
  591. cv2.imwrite(test_path, input_img)
  592. gray_img = cv2.cvtColor(crop_img, cv2.COLOR_BGR2GRAY)
  593. thresh, ret = cv2.threshold(gray_img, 0, 255, cv2.THRESH_OTSU)
  594. filter_condition = int(thresh * 0.90)
  595. _, black_thresh = cv2.threshold(gray_img, filter_condition, 255, cv2.THRESH_BINARY_INV)
  596. total_pixels = black_thresh.size
  597. black_pixels = np.count_nonzero(black_thresh)
  598. black_ratio = black_pixels / total_pixels
  599. return black_ratio
  600. # 用于判别字体大小
  601. def font_judge(self, kw_search_meta):
  602. if len(kw_search_meta) == 0:
  603. # 即未搜寻到关键字,非相关页
  604. return False
  605. for meta in kw_search_meta:
  606. keyword = meta["keyword"]
  607. font_size = meta["font_size"]
  608. logger.info(f"keyword:{keyword} has font_size: {font_size}")
  609. if font_size >= self.font_threshold:
  610. return True
  611. # 基于paddlepaddle的table ocr接口
  612. def table_parse(self, image_path: str, save_folder: str = ''):
  613. table_engine = PPStructure(show_log=True)
  614. img = cv2.imread(image_path)
  615. result = table_engine(img)
  616. expectation = {
  617. "table": {
  618. "title": [],
  619. "title_confidence": [],
  620. "content": [],
  621. "content_confidence": [],
  622. },
  623. "figure": {
  624. "content": [],
  625. "content_confidence": [],
  626. "caption": [],
  627. "caption_confidence": [],
  628. },
  629. "page_numbers": [],
  630. "others": []
  631. }
  632. for res in result:
  633. if res['type'] == 'title' or res['type'] == 'table_caption':
  634. if len(res['res']) > 0:
  635. expectation['table']['title_confidence'].append(res['res'][0]['confidence'])
  636. expectation['table']['title'].append(res['res'][0]['text'])
  637. elif res['type'] == 'table':
  638. expectation['table']['content_confidence'].append(res['score'])
  639. expectation['table']['content'].append(pd.read_html(res['res']['html'])[0].values.tolist())
  640. elif res['type'] == 'figure':
  641. expectation['figure']['content_confidence'].append(res['score'])
  642. expectation['figure']['content'].append(res['res'])
  643. elif res['type'] == 'figure_caption':
  644. expectation['figure']['caption_confidence'].append(res['score'])
  645. expectation['figure']['caption'].append(res['res'])
  646. else:
  647. expectation['others'].append(res)
  648. if save_folder:
  649. # 存储为save_folder/save_name
  650. save_structure_res(result, save_folder, os.path.basename(image_path).split('.')[0])
  651. return expectation
# Parses a PDF (bid document) and performs position matching based on the extracted text info.
class PdfMatcher(PdfExtractAttr):
    # file_path is the path of the supplied pdf file
    def __init__(self, file_path: str):
        """Build or load the per-document JSON caches (titles, outlines, texts, tables).

        The caches live in a sibling directory named after the pdf (without
        extension); missing caches are regenerated via the parent class's
        parse helpers.
        """
        super(PdfMatcher, self).__init__(
            file_path=file_path
        )
        # path of the bid document
        self.document = file_path
        # bid document name (file name without its 4-char ".pdf" extension)
        self.bid_name = file_path.split('/')[-1][:-4]
        # data folder for this bid document
        self.bid_dir = os.path.join(os.path.dirname(file_path), self.bid_name)
        # firm name — taken from the parent directory name of the pdf
        self.firm_name = file_path.split('/')[-2]
        # title list
        title_path = os.path.join(self.bid_dir, "title.json")
        # image list
        # self.image_dir = os.path.join(self.bid_dir, "extracted_images")
        # if (not os.path.exists(title_path)) or (not os.path.exists(self.image_dir)):
        # os.makedirs(self.image_dir, exist_ok=True)
        if not os.path.exists(title_path):
            self.main_parse(pdf_path=file_path, title_path=title_path)
            # self.main_parse(pdf_path=file_path, title_path=title_path, image_dir=self.image_dir)
        self.title = load_json(title_path)
        # outline list
        outline_path = os.path.join(self.bid_dir, "outlines.json")
        self.outline = self.parse_outline(out_path=outline_path)
        # text list
        text_path = os.path.join(self.bid_dir, "all_texts.json")
        self.details = self.parse_text(out_path=text_path)
        # table list (cached once parsed)
        table_path = os.path.join(self.bid_dir, "all_tables.json")
        if os.path.exists(table_path):
            self.table = load_json(table_path)
        else:
            self.table = self.parse_table_pro(table_path=table_path)
        # image format
        # self.image_format = "image_page_{}*"
        # image filter thresholds (page-count heuristics used by the search methods below)
        self.start_threshold = 10
        self.distance_threshold = 6
        self.search_threshold = 20
        # total pages
        self.total_pages = self.count_pages()
        # regexes for recognising licenses/certificates in text
        self.license_dict = {
            "business_license" : r'营业执照',
            "deposit": r'^(?:开户许可证|[\u4e00-\u9fff]+存款账户[\u4e00-\u9fff]+)$',
            "production_license": r'\b[\u4e00-\u9fff]*许可证\b',
            "qualtifications" : r'\b[\u4e00-\u9fff]*证书',
            "proof": r'\b[\u4e00-\u9fff]*证明',
        }

    # Search the title list for blocks that contain `keyword`.
    # digit_limit indicates whether to reject matches flanked by Chinese digit characters.
    def search_in_title(self, keyword, digit_limit=False):
        """Return [{'page_number', 'text'}] for title blocks containing *keyword*."""
        meta = []
        digits = "一二三四五六七八九十"
        for title_block in self.title:
            block_text = title_block['text'].replace(' ', '').strip()
            if digit_limit:
                if keyword in block_text:
                    # ensure the characters directly left/right of keyword are not Chinese digits
                    cnt = block_text.find(keyword)
                    length = len(keyword)
                    check_left = cnt - 1
                    check_right = cnt + length
                    if (check_left >= 0 and block_text[check_left] in digits) or (check_right < len(block_text) and block_text[check_right] in digits):
                        continue
                    # NOTE(review): matches that survive the digit check are never appended,
                    # so digit_limit=True appears to always return [] — confirm intended.
            else:
                if keyword in block_text:
                    meta.append({
                        "page_number": title_block["page_number"],
                        "text": block_text
                    })
        return meta

    # Search the outline list for entries that contain `keyword`.
    def search_in_outline(self, keyword):
        """Return [{'page_number', 'text'}] for outline entries containing *keyword*."""
        meta = []
        for outline_block in self.outline:
            block_text = outline_block['text'].replace(' ', '').strip()
            if keyword in block_text:
                meta.append({
                    "page_number": outline_block["page_number"],
                    "text": block_text
                })
        return meta

    # Locate the page ranges that likely hold the business license / qualification certificates.
    def search_license_interval(self, necessity_interval=None):
        '''Locate the page interval(s) of the business license and qualification certificates.

        Collects candidate (left, right) page intervals from the title list and
        the outline, optionally unions in *necessity_interval*, then sorts and
        merges overlapping intervals. Returns the merged interval list.
        '''
        # fuzzy location via keywords
        keywords = ['资格审查资料','资格审查材料','其它材料','其他材料','其他资料','附件', '影印件']
        search_interval = []
        license_pages = []
        # locate in title.json
        left_pos = -1  # left pointer
        right_pos = -1  # right pointer
        for title_block in self.title:
            block_text = title_block['text'].replace(' ', '').strip()
            # TODO run the license regexes first
            '''
            for key, format in self.license_dict.items():
                match = re.search(format, block_text)
                if match:
                    license_pages.append(title_block['page_number'])
            '''
            # close an open interval once a non-certificate title is seen
            if left_pos != -1 and '证书' not in block_text:
                right_pos = title_block['page_number']
                search_interval.append((left_pos, right_pos))
                # reset
                left_pos = -1
            for keyword in keywords:
                if keyword in block_text:
                    # fuzzy outline-style location first: dotted leaders may carry a page number
                    center_page = None
                    if '.' in block_text:
                        center_page = block_text.split('.')[-1]
                        if center_page.isdigit():
                            # NOTE(review): eval() on a digit string — int() would be safer
                            center_page = eval(center_page)
                            left_pos = min(title_block['page_number'], center_page)
                    else:
                        left_pos = title_block['page_number']
        # final decision for a still-open interval
        if left_pos != -1:
            search_interval.append((left_pos, right_pos))
            # reset
            left_pos = -1
            right_pos = -1
        # locate in outlines.json
        if len(self.outline) > 0:
            for outline_block in self.outline:
                if left_pos != -1:
                    right_pos = outline_block["page_number"]
                    right_pos = right_pos if right_pos is not None else -1
                    search_interval.append((left_pos, right_pos))
                    left_pos = -1
                outline_text = outline_block['title'].strip()
                for keyword in keywords:
                    if keyword in outline_text:
                        if outline_block["page_number"] is not None:
                            left_pos = outline_block["page_number"]
        # final decision
        if left_pos != -1:
            search_interval.append((left_pos, right_pos))
        if necessity_interval is not None:
            search_interval += necessity_interval
        # merge the collected intervals
        search_interval.sort()
        logger.info(f"search_interval: {search_interval} ...")
        merge_interval = []
        if len(search_interval) > 0:
            left = -1
            right = -1
            for interval in search_interval:
                l, r = interval
                # skip degenerate intervals (e.g. right bound still -1)
                if r < l:
                    continue
                # initialisation
                if left == -1 and right == -1:
                    left = l
                    right = r
                elif l <= right and r > right:
                    right = r
                elif l <= right:
                    continue
                else:
                    merge_interval.append((left, right))
                    left = l
                    right = r
            merge_interval.append((left, right))
        return merge_interval

    # Locate the page range of the similar-project performance info.
    def search_perf_info(self, ):
        """Return {'perf_page_number', 'qual_page_number', 'table'} for performance data.

        Tries extracted tables first, then outlines, then titles.
        """
        flag = False
        keywords = ['资格审查资料','资格审查材料']
        meta = {
            "perf_page_number": -1,
            "qual_page_number": set(),
            "table": None
        }
        # first check whether a performance table was already extracted from the table data
        for table_block in self.table:
            page_number = table_block["page_numbers"]
            table_name = table_block["table_name"]
            table_name = table_name.strip().replace("\n", "").replace(" ", "")
            if ('类似' in table_name) and (('项目' in table_name) or ('业绩' in table_name)):
                flag = True
                meta["perf_page_number"] = page_number
                meta["table"] = table_block["table"]
                break
        if flag:
            return meta
        # fuzzy match in outlines
        for outline_block in self.outline:
            page_number = outline_block["page_number"]
            text = outline_block["title"]
            text = text.strip().replace("\n", "").replace(" ", "")
            for keyword in keywords:
                if keyword in text:
                    qual_page = page_number
                    meta["qual_page_number"].add(qual_page)
            if ('类似' in text) and (('项目' in text) or ('业绩' in text)):
                flag = True
                meta["perf_page_number"] = page_number
                break
        if flag:
            return meta
        # fuzzy match in titles
        for title_block in self.title:
            page_number = title_block["page_number"]
            text = title_block["text"]
            text = text.strip().replace("\n", "").replace(" ", "")
            for keyword in keywords:
                if keyword in text:
                    qual_page = page_number
                    meta["qual_page_number"].add(qual_page)
            if ('类似' in text) and (('项目' in text) or ('业绩' in text)):
                flag = True
                meta["perf_page_number"] = page_number
                break
        return meta

    # Return the set of images that may be a business license or qualification certificate.
    def find_candidate_images(self):
        """Collect candidate image paths from the merged license intervals (extracted-image variant)."""
        candidate_images = set()
        merge_intervals = self.search_license_interval()
        logger.info(f"merge_intervals: {merge_intervals}")
        for interval in merge_intervals:
            start_page, end_page = interval
            # ignore intervals starting in the first pages of the document
            if start_page <= self.start_threshold:
                continue
            # open-ended interval: assume 20 pages
            if end_page == -1:
                end_page = start_page + 20
            candidate_images = self.image_regularization(start_page=max(0, start_page-self.search_threshold), end_page=end_page+self.search_threshold, candidate_images=candidate_images)
        candidate_images = list(candidate_images)
        return candidate_images

    # Use a glob pattern to look up images matching the per-page naming scheme.
    def image_regularization(self, start_page: int, end_page:int, candidate_images: set):
        """Add to *candidate_images* every non-'.unk' file matching the page pattern in [start_page, end_page].

        NOTE(review): relies on self.image_format / self.image_dir, which are
        commented out in __init__ — calling this would raise AttributeError
        unless they are set elsewhere; confirm before use.
        """
        for index in range(start_page, end_page + 1):
            current_format = self.image_format.format(index)
            files = glob.glob(os.path.join(self.image_dir, current_format))
            filter_files = [file for file in files if not file.endswith('.unk')]
            candidate_images.update(filter_files)
        return candidate_images

    # Return candidate license/certificate images from the pdf2img scanned pages.
    def find_candidate_images_pro(self, necessity_interval=None):
        """Like find_candidate_images, but works on the scanned page JPEGs.

        For each page in the merged intervals, both the raw page and a
        red-seal-removed copy are added to the candidate list.
        """
        scanned_dir = self.pdf2img()
        candidate_images = set()
        merge_intervals = self.search_license_interval(necessity_interval=necessity_interval)
        logger.info(f"merge_intervals: {merge_intervals}")
        for interval in merge_intervals:
            start_page, end_page = interval
            if start_page <= self.start_threshold:
                continue
            if end_page == -1:
                end_page = start_page + 20
            for index in range(start_page, end_page + 1):
                img_path = os.path.join(scanned_dir, f'page-{index}.jpg')
                processed_img_path = os.path.join(scanned_dir, f'page-{index}_red_roi.jpg')
                # create the red-seal-removed copy lazily
                if os.path.exists(img_path) and (not os.path.exists(processed_img_path)):
                    processed_img = remove_red_seal(image_path=img_path)
                    cv2.imwrite(processed_img_path, processed_img)
                candidate_images.add(img_path)
                candidate_images.add(processed_img_path)
        candidate_images = list(candidate_images)
        return candidate_images

    # Check the extracted table data for the bid price summary form.
    def find_bid_quotation_form(self):
        """Return (page_number, parsed_table) if a bid quotation table is found, else None.

        Matches first by table name keyword, then by the key column name.
        """
        keywords = ["投标报价总表", "投标报价汇总表"]
        key_column = '增值税金额'
        tables = []
        flag = False
        for table_block in self.table:
            page_number = table_block["page_numbers"]
            table_name = table_block["table_name"]
            table_name = table_name.replace(' ', '')
            # 1) look up the table by name keyword
            for keyword in keywords:
                if keyword in table_name:
                    tables = table_block["table"]
                    flag = True
                    break
            # 2) otherwise look it up by the key column name
            if len(tables) == 0:
                column_num = len(table_block["table"])
                cnt = 0
                while cnt < column_num:
                    column_list = table_block["table"][cnt]
                    for column_name in column_list:
                        if column_name is not None:
                            column_name = column_name.replace("\n", "").replace(" ", "").strip()
                            if key_column in column_name:
                                tables = table_block["table"]
                                flag = True
                                break
                            # '其中' marks a grouped header row: also inspect the next row
                            if '其中' in column_name:
                                cnt += 1
                    if (not cnt) or flag:
                        break
            if flag:
                break
        # a bid quotation table exists in the extracted tables
        if flag:
            parsed_table = self.extract_table(table=tables)
            return page_number, parsed_table
        # no bid quotation table in the extracted tables
        return None

    # Check the extracted tables for the staffing form (拟投入本项目人员配备情况表) or
    # the project management organisation form (项目管理机构组成表).
    def find_itempeople_form(self):
        """Return {'candidate_page': set, 'table_list': list} for project-staff tables.

        Matches tables by name keyword, then by 职务/职称 column headers, and
        also collects candidate pages from outlines and titles.
        """
        keywords = ['拟投入本项目人员配备情况表', '项目管理机构组成表', '项目管理机构成员', '项目管理组成表']
        flag = False  # marks whether a table was found via table_name
        meta = {
            "candidate_page": set(),
            "table_list": [],
        }
        for table_block in self.table:
            if len(table_block["table"]) == 0:
                continue
            page_number = table_block["page_numbers"]
            table_name = table_block["table_name"]
            table_name = table_name.strip().replace("\n", "").replace(" ", "")
            for keyword in keywords:
                if keyword in table_name:
                    meta["table_list"].append({
                        "page_number":page_number,
                        "table": table_block["table"]
                    })
                    flag = True
                    break
            if flag:
                return meta
            # fall back to the header row: 职务 (position) / 职称 (title) columns
            column_name_list = table_block["table"][0]
            for column_name in column_name_list:
                if column_name is not None:
                    column_name = column_name.strip().replace("\n", "").replace(" ", "")
                    if '职务' in column_name or '职称' in column_name:
                        meta["table_list"].append({
                            "page_number":page_number,
                            "table": table_block["table"]
                        })
                        break
        sec_keywords = ['拟投入本项目人员配备情况表', '项目管理机构', '项目管理机构组成表']
        # locate the project-management sections in outlines
        for outline_block in self.outline:
            page_number = outline_block["page_number"]
            text = outline_block["title"]
            text = text.strip().replace("\n", "").replace(" ", "")
            for sec_keyword in sec_keywords:
                if sec_keyword in text:
                    # dotted-leader entries may carry the printed page number
                    if '.' in text:
                        page = text.split('.')[-1]
                        if page.isdigit():
                            # NOTE(review): eval() on a digit string — int() would be safer
                            page = eval(page)
                    else:
                        page = page_number
                    meta["candidate_page"].add(page)
        # locate the project-management sections in titles
        for title_block in self.title:
            page_number = title_block["page_number"]
            text = title_block["text"]
            text = text.strip().replace("\n", "").replace(" ", "")
            for sec_keyword in sec_keywords:
                if sec_keyword in text:
                    if '.' in text:
                        page = text.split('.')[-1]
                        if page.isdigit():
                            page = eval(page)
                    else:
                        page = page_number
                    meta["candidate_page"].add(page)
        return meta

    # Normalise an extracted table.
    def extract_table(self, table):
        """Return (column_list, new_table): deduplicated header names plus the data rows.

        Header rows are consumed until `column_num` distinct names are seen;
        remaining rows are copied verbatim.
        """
        row_num = len(table)
        if row_num == 0:
            return [], []
        column_num = len(table[0])
        new_table = []
        # first step: complete the column names
        cnt = 0  # start from the first row
        column_list = []
        while len(column_list) < column_num and cnt < row_num:
            current_column_list = table[cnt]
            for column_name in current_column_list:
                column_name = str(column_name).strip().replace("\n", "").replace(" ", "")
                # NOTE(review): after str() the != None check is always True
                if (column_name != None) and ('其中' not in column_name) and (column_name not in column_list):
                    column_list.append(column_name)
            if len(column_list) < column_num:
                cnt += 1
        # second step: fill in the table body
        new_table.append(column_list)
        for i in range(cnt + 1, row_num):
            tmp = []
            for j in range(column_num):
                element = table[i][j]
                tmp.append(element)
            new_table.append(tmp)
        return column_list, new_table

    # Count the pdf's total pages.
    def count_pages(self):
        """Return the number of pages of the pdf (self.file_path — presumably set by PdfExtractAttr)."""
        reader = PdfReader(self.file_path)
        return len(reader.pages)

    # Create (if needed) the pdf->image 'scanned' folder.
    def pdf2img(self):
        """Render each pdf page to scanned/page-<i>.jpg (once) and return the folder path."""
        scanned_dir = os.path.join(self.bid_dir, 'scanned')
        if os.path.exists(scanned_dir):
            logger.info(f"检测到当前投标文件{self.bid_dir}存在扫描文件夹 ...")
        else:
            os.makedirs(scanned_dir, exist_ok=True)
            logger.info(f"开始转换pdf2img页面")
            convert_start_time = time.time()
            try:
                images = convert_from_path(pdf_path=self.document)
                for i, image in enumerate(images):
                    image.save(os.path.join(scanned_dir, f'page-{i}.jpg'), 'JPEG')
                logger.info("convert successfully !")
            except subprocess.CalledProcessError as e:
                logger.info(f"convert failure: {e}")
            convert_cost_time = time.time() - convert_start_time
            logger.info(f"转化pdf2img花费{convert_cost_time // 60} min {convert_cost_time % 60} sec ...")
        return scanned_dir
  1073. class PdfParse_pipeline():
  1074. def __init__(self,
  1075. ocr, # ocr接口
  1076. firm_dir, # 存储所有公司的路径
  1077. out_path, # 输出地址
  1078. ):
  1079. self.ocr = ocr
  1080. self.firm_dir = firm_dir
  1081. self.out_path = out_path
  1082. def parse_pipeline(self):
  1083. data = {}
  1084. for firm_name in tqdm(os.listdir(self.firm_dir)):
  1085. logger.info(f'processing firm {firm_name} ...')
  1086. firm_path = os.path.join(self.firm_dir, firm_name)
  1087. for bid_name in tqdm(os.listdir(firm_path)):
  1088. if bid_name.endswith('.pdf'):
  1089. document=os.path.join(firm_path, bid_name)
  1090. bid_dir = os.path.join(firm_path, bid_name[:-4])
  1091. os.makedirs(bid_dir, exist_ok=True)
  1092. document_data = self.parse_single_document(pdf_path=document)
  1093. data[firm_name] = document_data
  1094. # 以下将data的数据存入out_path
  1095. with open(self.out_path, 'w', encoding='utf-8') as f:
  1096. json.dump(data, f, ensure_ascii=False, indent=4)
  1097. return data
  1098. def parse_single_document(self, pdf_path: str):
  1099. agent = PdfMatcher(file_path=pdf_path)
  1100. firm_name = agent.firm_name
  1101. total_pages = agent.total_pages
  1102. data = {
  1103. "necessity_interval": [],
  1104. # 投标函中是否有签字 or 盖章
  1105. "has_signature_or_seal": False,
  1106. "formatting_img": None,
  1107. # 资质证书 & 营业执照信息
  1108. "license_list":[],
  1109. # 投标报价汇总表
  1110. "bid_form": None,
  1111. # 相关业绩表
  1112. "perf_info": [],
  1113. # 项目经理相关信息
  1114. "manager": [],
  1115. "kw_meta": {}
  1116. }
  1117. logger.info("start finding the kw info in directory ...")
  1118. kw_meta = self.find_kw_from_dc(agent=agent, data=data, total_pages=total_pages)
  1119. logger.info("start processing the nextiter information ...")
  1120. # iter = self.parse_nextiter(agent=agent, data=data, total_pages=total_pages)
  1121. # for signature or seal
  1122. logger.info("start judging the signature & seal information ...")
  1123. # self.parse_bid(agent=agent, data=data, total_pages=total_pages)
  1124. # for license_list
  1125. logger.info("start finding license information ...")
  1126. # self.parse_license(agent=agent, data=data, iter=iter, firm_name=firm_name)
  1127. # for bid_form
  1128. logger.info("start finding bid form ...")
  1129. # self.parse_bid_form(agent=agent, data=data)
  1130. # for perf information
  1131. logger.info("start finding perf information ...")
  1132. # self.parse_perf(agent=agent, data=data)
  1133. # for manager
  1134. logger.info("start finding manager information ...")
  1135. self.parse_manager(agent=agent, data=data, kw_meta=kw_meta["manager"])
  1136. return data
    # Query the table-of-contents pages for the configured keywords and the page each maps to.
    def find_kw_from_dc(self, agent, data, total_pages):
        """Scan the first (up to) 20 scanned pages for the TOC and collect keyword hits.

        For every keyword group, records the matched TOC entry and its printed
        page label. The result is stored in data["kw_meta"] and returned as
        {group: [{'element', 'word', 'label_page'}, ...]}.
        """
        meta = {}
        keywords = {
            "manager": ['拟投入本项目人员配备情况表', '项目管理机构组成表', '项目管理机构成员', '项目管理组成表']
        }
        # initialisation: one result list per keyword group
        for kw in keywords:
            meta[kw] = []
        scanned_dir = agent.pdf2img()
        # the table of contents usually sits within the first 20 pages
        start = 0
        end = 20 if total_pages > 20 else total_pages
        is_enter = False
        for index in range(start, end):
            logger.info(f"find kw from index {index} ...")
            img_path = os.path.join(scanned_dir, f'page-{index}.jpg')
            processed_img_path = os.path.join(scanned_dir, f'page-{index}_red_roi.jpg')
            # remove the red seal (cached)
            if not os.path.exists(processed_img_path):
                processed_img = remove_red_seal(image_path=img_path)
                cv2.imwrite(processed_img_path, processed_img)
            # run OCR on the seal-removed page
            content = self.ocr.get_content(image_path=processed_img_path)
            image_info = content["rawjson"]["ret"]
            if not is_enter and self.ocr.search(image_info, '目录'):
                # first TOC page found — mark is_enter
                is_enter = True
            # already inside the TOC pages
            if is_enter:
                # search all keyword groups on this page
                for kw, elements in keywords.items():
                    pack_info = self.ocr.pack_search(image_info=image_info, key_list=elements)
                    logger.info(pack_info)
                    # extract the numeric page label for each hit
                    if len(pack_info) > 0:
                        for info in pack_info:
                            word = info["word"]
                            contain_key = info["contain_key"]
                            pos = info["bbox"]
                            # the word itself ends with the page number
                            # NOTE(review): word[-1] raises IndexError on an empty word — confirm OCR never returns one
                            if word[-1].isdigit():
                                label_page = word.split('.')[-1]
                                meta[kw].append(
                                    {
                                        "element": contain_key,
                                        "word": word,
                                        "label_page": label_page
                                    }
                                )
                            else:
                                # otherwise resolve the page label from a nearby digit box
                                meta[kw].append(
                                    {
                                        "element": contain_key,
                                        "word": word,
                                        "label_page": self.ocr.digit_label(image_info=image_info, pos=pos)
                                    }
                                )
        data["kw_meta"] = meta
        return meta
    def parse_nextiter(self, agent, data, total_pages):
        """Scan the TOC pages and, for each section keyword hit, derive the index of the NEXT section.

        Returns a list of {'current_key', 'next_index'} records (also stored in
        data["iter"]) that later stages use to bound page intervals.
        """
        # the table of contents usually carries the keyword: 目录
        keyword = '目录'
        # section keywords whose following chapter must be located
        iter_keywords = {
            '1': ['资格审查资料', '资格审查材料'],
            '2': ['其他材料', '其它材料', '其他资料', '其它资料'],
            '3': ['附件'],
            '4': ['影印件']
        }
        # NOTE(review): index_keywords is defined but not referenced below
        index_keywords = {
            '1': ['一、', '二、', '三、', '四、', '五、', '六、', '七、', '八、', '九、'],
            '2': ['一章', '二章', '三章', '四章', '五章', '六章', '七章', '八章', '九章', '十章']
        }
        # compute the successor of a (Chinese or Arabic) section numeral
        def find_next(current_index):
            """Return the numeral following *current_index* ('三' -> '四', '9' -> '10', '十九' -> '二十')."""
            logger.info(f"processing current_index: {current_index}")
            # successor table for single Chinese digits; '十' maps to '二' so that
            # e.g. '十九' -> cycle['十'] + '十' = '二十' (tens carry)
            cycle = {
                "一": "二",
                "二": "三",
                "三": "四",
                "四": "五",
                "五": "六",
                "六": "七",
                "七": "八",
                "八": "九",
                "九": "十",
                "十": "二",
            }
            # Arabic numeral: plain +1
            if current_index.isdigit():
                # NOTE(review): eval() on a digit string — int() would be safer
                next_index = str(eval(current_index) + 1)
                return next_index
            next_index = ""
            # single character: direct successor ('十' is special-cased to '十一')
            if len(current_index) == 1:
                if current_index in cycle.keys():
                    if current_index == "十":
                        next_index = "十一"
                    else:
                        next_index = cycle[current_index]
                else:
                    raise ValueError(f"筛选current index {current_index} 有误 ...")
                return next_index
            # multi-character numeral: handle the trailing digit with carry
            if current_index[-1] == '九':
                if current_index[0] in cycle.keys():
                    next_index = cycle[current_index[0]] + '十'
                else:
                    return ""
            elif current_index[-1] == '十':
                next_index = current_index + '一'
            else:
                if current_index[-1] in cycle.keys():
                    next_index = current_index[:-1] + cycle[current_index[-1]]
                else:
                    return ""
            return next_index
        # extract the current section index from a TOC string and return its successor
        def refine(string: str):
            """Return the next section index parsed from a TOC entry, or "" when none is found."""
            digit_keywords = "123456789一二三四五六七八九十"
            string = string.strip().replace(' ', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
            # bail out early when the string carries no digit at all
            flag = False
            for digit_kw in digit_keywords:
                if digit_kw in string:
                    flag = True
            if not flag:
                return ""
            # form "第X章、..." — walk the prefix before '、' collecting the numeral
            if '、' in string and '章' in string:
                index_string = string.split('、')[0]
                current_index = ""
                next_index = ""
                is_start = False
                for c in index_string:
                    if c == "第":
                        is_start = True
                    elif (not is_start) and c in digit_keywords:
                        is_start = True
                        current_index += c
                    elif c == "章":
                        next_index = find_next(current_index)
                    elif is_start and c in digit_keywords:
                        current_index += c
                return next_index
            # form "X、..."
            if '、' in string:
                index_string = string.split('、')[0]
                next_index = find_next(index_string)
                return next_index
            # form "第X章"
            if '章' in string and '第' in string:
                l = string.find('第')
                r = string.find('章')
                index_string = string[l+1:r]
                next_index = find_next(index_string)
                return next_index
            return ""
        # given the keyword's bounding box, find the OCR word holding its section index
        def find_ocr_index(image_info, bbox: dict):
            """Return {'candidate_word', 'candidate_distance'}: the digit-bearing word left of *bbox* with the closest top edge."""
            meta = {}
            candidate_distance = 10000
            candidate_word = ""
            keywords = "123456789一二三四五六七八九十"
            match_left = bbox['left']
            match_right = bbox['right']
            match_top = bbox['top']
            match_bottom = bbox['bottom']
            for info in image_info:
                word = info['word'].replace(' ', '')
                left = info['rect']['left']
                top = info['rect']['top']
                width = info['rect']['width']
                height = info['rect']['height']
                right = left + width
                bottom = top + height
                for keyword in keywords:
                    # candidate must lie to the left of the match and contain a digit
                    if keyword in word and left < match_left and right < match_right:
                        distance = abs(top - match_top)
                        if distance < candidate_distance:
                            candidate_word = word
                            candidate_distance = distance
            meta["candidate_word"] = candidate_word
            meta["candidate_distance"] = candidate_distance
            return meta
        iter = []
        scanned_dir = agent.pdf2img()
        # the TOC usually sits within the first 20 pages
        start = 0
        end = 20 if total_pages > 20 else total_pages
        is_enter = False
        for index in range(start, end):
            img_path = os.path.join(scanned_dir, f'page-{index}.jpg')
            processed_img_path = os.path.join(scanned_dir, f'page-{index}_red_roi.jpg')
            # remove the red seal (cached)
            if not os.path.exists(processed_img_path):
                processed_img = remove_red_seal(image_path=img_path)
                cv2.imwrite(processed_img_path, processed_img)
            # OCR the seal-removed page
            content = self.ocr.get_content(image_path=processed_img_path)
            image_info = content["rawjson"]["ret"]
            if not is_enter and self.ocr.search(image_info, keyword):
                # first TOC page found — mark is_enter
                is_enter = True
            # already inside the TOC pages
            if is_enter:
                for id, cover_keywords in iter_keywords.items():
                    meta = self.ocr.pack_search(image_info, cover_keywords)
                    if len(meta) == 0:
                        continue
                    for meta_info in meta:
                        word = meta_info['word']
                        logger.info(f"processing iter word: {word}")
                        contain_key = meta_info['contain_key']
                        bbox = meta_info['bbox']
                        # determine the section index of the word
                        # check the word itself first
                        if '、' in word or ('章' in word and '第' in word):
                            next_index = refine(word)
                            if next_index != "":
                                iter.append({
                                    "current_key": contain_key,
                                    "next_index": next_index
                                })
                        else:
                            # otherwise fall back to the nearest OCR digit word
                            meta = find_ocr_index(image_info, bbox)
                            candidate_word = meta["candidate_word"]
                            next_index = refine(candidate_word)
                            iter.append({
                                "current_key": contain_key,
                                "next_index": next_index
                            })
        data["iter"] = iter
        return iter
    def parse_bid(self, agent, data, total_pages):
        """Locate the bid letter (投标函) pages and check them for a signature or seal.

        On the first page that both matches a bid-letter heading and passes
        signature recognition, sets data["has_signature_or_seal"]=True and
        records the page image path.
        """
        # TODO since the bid letter mostly appears in the first 30 pages, only search those for now
        start_page = 0
        end_page = 30 if total_pages > 30 else total_pages
        scanned_dir = agent.pdf2img()
        key_list = ['一、投标函及投标函附录', '1投标函及投标函附录', '1、投标函及投标函附录', '投标函及投标函附录', '投标函', '一、投标函', '1.投标函', '1投标函', '一投标函', '(一)投标函', '(一)投标函', '(一)、投标函', '(一)、投标函']
        for index in range(start_page, end_page + 1):
            img_path = os.path.join(scanned_dir, f'page-{index}.jpg')
            # first decide whether the page content is a bid letter
            content = self.ocr.get_content(image_path=img_path)
            image_info = content["rawjson"]["ret"]
            kw_search_meta = self.ocr.exact_search(image_info, key_list)
            kw_search_res = self.ocr.font_judge(kw_search_meta)
            ol_search_res = self.ocr.search(image_info, ['目录'])
            # skip pages without a bid-letter heading, and skip TOC pages
            if (not kw_search_res) or ol_search_res:
                continue
            result = self.ocr.signature_recognition(image_path=img_path)
            if result:
                data["has_signature_or_seal"] = True
                data["formatting_img"] = img_path
                return
    def parse_license(self, agent, iter, data, firm_name):
        """Collect license/certificate pages into data["license_list"].

        For each section pair from *iter*, derives a (left, right) page interval
        from the title list, then OCR-judges the candidate page images; falls
        back to scanning the entire document when no candidates are found.
        """
        # find the page of contain_key first, then the page of next_index
        necessity_interval = []
        # walk each (current section, next section) pair
        for unit_iter in iter:
            contain_key = unit_iter["current_key"]
            next_index = unit_iter["next_index"]
            kw_title_meta = agent.search_in_title(contain_key)
            iter_title_meta = agent.search_in_title(next_index, digit_limit=True)
            left = 10000
            right = -1
            left_kw = ""
            right_kw = ""
            # determine the right page first
            if len(iter_title_meta) == 0:
                right = agent.total_pages
            else:
                for iter_meta in iter_title_meta:
                    page_number = iter_meta["page_number"]
                    iter_text = iter_meta["text"]
                    # ignore hits in the first 20 pages (TOC area)
                    if page_number < 20:
                        continue
                    else:
                        if page_number > right:
                            right = page_number
                            right_kw = iter_text
                if right == -1:
                    right = agent.total_pages
            # then determine the left page
            if len(kw_title_meta) == 0:
                continue
            else:
                for kw_meta in kw_title_meta:
                    page_number = kw_meta["page_number"]
                    title_text = kw_meta["text"]
                    if page_number < 20 or page_number > right:
                        continue
                    else:
                        if page_number < left:
                            left = page_number
                            left_kw = title_text
                # no usable left bound for this pair
                if left == 10000:
                    continue
            necessity_interval.append((left, right))
            data["necessity_interval"].append(
                {
                    "left_kw": left_kw,
                    "right_kw": right_kw,
                    "left_page": left,
                    "right_page": right
                }
            )
        candidate_images = agent.find_candidate_images_pro(necessity_interval=necessity_interval)
        # candidate_images = agent.find_candidate_images()
        logger.info(candidate_images)
        # import pdb; pdb.set_trace()
        if len(candidate_images) == 0:
            # fallback: judge every page of the document
            scanned_dir = agent.pdf2img()
            for index in range(0, agent.total_pages):
                img_path = os.path.join(scanned_dir, f'page-{index}.jpg')
                processed_img_path = os.path.join(scanned_dir, f'page-{index}_red_roi.jpg')
                if not os.path.exists(processed_img_path):
                    processed_img = remove_red_seal(image_path=img_path)
                    cv2.imwrite(processed_img_path, processed_img)
                try:
                    # judge on the seal-removed copy, but record the raw page path
                    response = self.ocr.judge_pro(image_path=processed_img_path, firm_name=firm_name)
                    if response == None or response['qualtified'] == None:
                        continue
                    else:
                        data["license_list"].append({
                            "license_name": response["license_name"],
                            "license_path": img_path,
                            "license_page": response["license_page"],
                            "start_datetime": response["start_datetime"],
                            "end_datetime": response["end_datetime"]
                        })
                except ValueError as e:
                    print(e)
        else:
            for img in candidate_images:
                try:
                    response = self.ocr.judge_pro(image_path=img, firm_name=firm_name)
                    if response == None or response['qualtified'] == None:
                        continue
                    else:
                        data["license_list"].append({
                            "license_name": response["license_name"],
                            "license_path": img,
                            "license_page": response["license_page"],
                            "start_datetime": response["start_datetime"],
                            "end_datetime": response["end_datetime"]
                        })
                except ValueError as e:
                    print(e)
    def parse_bid_form(self, agent, data):
        """Fill data["bid_form"] with the bid price summary table.

        Uses the table extracted from the pdf text when available; otherwise
        OCR-parses the first 50 scanned pages looking for the key column or the
        table title keyword.
        """
        result = agent.find_bid_quotation_form()
        if result is None:
            # convert to scanned pages first
            scanned_dir = agent.pdf2img()
            key_column = '增值税金额'
            img_list = glob.glob(os.path.join(scanned_dir, '*.jpg'))
            for img_prefix in img_list:
                img_name = os.path.basename(img_prefix)
                # skip derived images (seal-removed / ink variants)
                if ('roi' in img_name) or ('ink' in img_name):
                    continue
                img_index = int(img_name.split('-')[1].split('.')[0])
                # the summary table is expected within the first 50 pages
                if img_index > 50:
                    continue
                img_path = os.path.join(scanned_dir, img_name)
                # TODO add OCR assist for the literal "投标报价汇总表" title
                expectation = self.ocr.table_parse(image_path=img_path, save_folder=scanned_dir)
                content = self.ocr.get_content(image_path=img_path)
                image_info = content["rawjson"]["ret"]
                kw_res = self.ocr.search(image_info=image_info, key_list=['投标报价汇总表'])
                table_list = expectation['table']['content']
                if len(table_list) > 0:
                    for table in table_list:
                        column_list, parsed_table = agent.extract_table(table=table)
                        for column_name in column_list:
                            # key column found → definitive match, stop searching
                            if key_column in column_name:
                                data["bid_form"] = {
                                    "page": [img_index],
                                    "table": parsed_table
                                }
                                return
                # title keyword only → record the page but keep scanning
                # (a later key-column match may overwrite this with a full table)
                if kw_res:
                    data["bid_form"] = {
                        "page": [img_index]
                    }
        else:
            page_number, target_table = result
            data["bid_form"] = {
                "page": page_number,
                "table": target_table
            }
    def parse_perf(self, agent, data):
        """Append similar-project performance tables to data["perf_info"].

        Uses the directly-extracted table when available; otherwise OCRs the
        scanned pages from a heuristically-chosen center page onwards.
        """
        perf_meta = agent.search_perf_info()
        # import pdb; pdb.set_trace()
        if perf_meta["table"] is not None:
            data["perf_info"].append({
                "perf_page": perf_meta["perf_page_number"],
                "perf_table": perf_meta["table"]
            })
        else:
            center_page = 0
            if perf_meta["perf_page_number"] != -1:
                center_page = perf_meta["perf_page_number"]
            if len(perf_meta["qual_page_number"]) > 0:
                tmp = 10000
                for candidate_page in perf_meta["qual_page_number"]:
                    if candidate_page > agent.start_threshold:
                        tmp = min(tmp, candidate_page)
                # NOTE(review): min() keeps center_page at 0 when no perf page was found,
                # which forces a full-document scan — confirm max()/conditional wasn't intended
                center_page = min(center_page, tmp)
            scanned_dir = agent.pdf2img()
            img_list = glob.glob(os.path.join(scanned_dir, 'page-*.jpg'))
            for img_prefix in img_list:
                img_name = os.path.basename(img_prefix)
                # skip derived images (seal-removed / ink variants)
                if ('roi' in img_name) or ('ink' in img_name):
                    continue
                img_index = int(img_name.split('-')[1].split('.')[0])
                if img_index >= center_page:
                    img_path = os.path.join(scanned_dir, img_name)
                    # 1st step: remove the red seal (cached)
                    processed_path = os.path.join(scanned_dir, f'page-{img_index}_red_roi.jpg')
                    processed_folder = os.path.join(scanned_dir, 'processed')
                    os.makedirs(processed_folder, exist_ok=True)
                    if not os.path.exists(processed_path):
                        processed_img = remove_red_seal(img_path)
                        cv2.imwrite(processed_path, processed_img)
                    # 2nd step: OCR-search for the keyword
                    content = self.ocr.get_content(image_path=processed_path)
                    image_info = content["rawjson"]["ret"]
                    if self.ocr.search(image_info, ['类似']):
                        # 3rd step: parse the tables on this page
                        expectation = self.ocr.table_parse(image_path=processed_path, save_folder=processed_folder)
                        table_list = expectation['table']['content']
                        data["perf_info"].append({
                            "perf_page": img_index + 1,
                            "perf_table": table_list
                        })
  1568. def parse_manager(self, agent, data, kw_meta=None):
  1569. keywords = ['拟投入本项目人员配备情况表', '项目管理机构组成表', '项目管理机构成员', '项目管理组成表', '职务', '职称']
  1570. meta = agent.find_itempeople_form()
  1571. if len(meta["table_list"]) > 0:
  1572. # 找到类似表格
  1573. data["manager"] = meta["table_list"]
  1574. else:
  1575. candidate_page_set = meta["candidate_page"]
  1576. if len(candidate_page_set) == 0 and (kw_meta is None or len(kw_meta) == 0):
  1577. logger.info("查询候选项目经理为空, 开始进行全文档搜索")
  1578. scanned_dir = agent.pdf2img()
  1579. for index in range(0, agent.total_pages):
  1580. raw_page = os.path.join(scanned_dir, f'page-{index}.jpg')
  1581. processed_page = os.path.join(scanned_dir, f'page-{index}_red_roi.jpg')
  1582. if not os.path.exists(processed_page):
  1583. processed_img = remove_red_seal(image_path=raw_page)
  1584. cv2.imwrite(processed_page, processed_img)
  1585. # 对处理过红章的页面进行ocr
  1586. content = self.ocr.get_content(image_path=processed_page)
  1587. image_info = content["rawjson"]["ret"]
  1588. if self.ocr.search(image_info, keywords):
  1589. expectation = self.ocr.table_parse(image_path=processed_page, save_folder=scanned_dir)
  1590. table_list = expectation['table']['content']
  1591. if len(table_list) > 0:
  1592. for table in table_list:
  1593. column_list, parsed_table = agent.extract_table(table=table)
  1594. for column_name in column_list:
  1595. if '职称' in column_name or '职务' in column_name:
  1596. data["manager"].append(parsed_table)
  1597. else:
  1598. spread_set = set()
  1599. # from candidate_page_set
  1600. for candidate_page in candidate_page_set:
  1601. cnt = 0
  1602. while cnt <= 20 and candidate_page + cnt < agent.total_pages:
  1603. spread_set.add(candidate_page + cnt)
  1604. cnt += 1
  1605. # from meta
  1606. if kw_meta is not None and len(kw_meta) > 0:
  1607. for unit_meta in kw_meta:
  1608. label_page = unit_meta["label_page"]
  1609. if label_page.isdigit():
  1610. label_page = int(label_page)
  1611. cnt = -5
  1612. while cnt <= 5 and label_page + cnt < agent.total_pages:
  1613. spread_set.add(label_page + cnt)
  1614. cnt += 1
  1615. # 给每一个候选图片20区域范围
  1616. scanned_dir = agent.pdf2img()
  1617. for candidate_img in spread_set:
  1618. candidate_path = os.path.join(scanned_dir, f'page-{candidate_img}.jpg')
  1619. expectation = self.ocr.table_parse(image_path=candidate_path, save_folder=scanned_dir)
  1620. table_list = expectation['table']['content']
  1621. if len(table_list) > 0:
  1622. for table in table_list:
  1623. column_list, parsed_table = agent.extract_table(table=table)
  1624. for column_name in column_list:
  1625. if '职称' in column_name or '职务' in column_name:
  1626. data["manager"].append(parsed_table)
  1627. if __name__ == "__main__":
  1628. # [测试demo]
  1629. start_time = time.time()
  1630. # 请针对自己的环境进行修改log_path
  1631. global logger
  1632. firm_list = ['太原重工']
  1633. # firm_list = ['湖北海光']
  1634. for firm in firm_list:
  1635. log_path = f"/home/stf/miner_pdf/interface/test_outdir/manager_test/test_{firm}.log"
  1636. logger = create_logger(log_path=log_path)
  1637. # [环境参数]
  1638. # ocr url
  1639. url = "http://120.48.103.13:18000/ctr_ocr"
  1640. # seal_ocr url
  1641. base_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/seal?access_token="
  1642. # seal_ocr access_token
  1643. access_token = "24.6bbe9987c6bd19ba65e4402917811657.2592000.1724573148.282335-86574608"
  1644. # seal request url
  1645. seal_url = base_url + access_token
  1646. # seal_ocr headers
  1647. headers = {'content-type': 'application/x-www-form-urlencoded'}
  1648. # data_path为存储所有投标公司的起始路径
  1649. data_path = "/home/stf/miner_pdf/data/投标公司pdf"
  1650. # test_data_path为存储测试投标公司的起始路径
  1651. test_data_path = "/home/stf/miner_pdf/interface/test_files"
  1652. # test_out_path存储目前优化代码的测试结果!!!
  1653. test_out_path = "/home/stf/miner_pdf/interface/outdir/test_out.json"
  1654. unit_data_path = f"/home/stf/miner_pdf/interface/unit_test/{firm}"
  1655. # unit_out_path = f"/home/stf/miner_pdf/interface/outdir/unit_{firm}.json"
  1656. unit_out_path = f"/home/stf/miner_pdf/interface/test_outdir/manager_test/unit_{firm}.json"
  1657. # pipeline_out_path为执行所有公司pipeline逻辑后的输出位置
  1658. # 其为存放营业执照和资质证书位置信息的json文件
  1659. pipeline_out_path = "/home/stf/miner_pdf/interface/outdir/test_pipeline.json"
  1660. # single_out_path为执行单个公司pdf解析逻辑后的输出位置
  1661. # 其为存放营业执照和资质证书位置信息的json文件
  1662. single_out_path = "/home/stf/miner_pdf/interface/outdir/test_single.json"
  1663. # ground_truth目前为存储所有非扫描公司在pdf中营业执照与资质证书的json文件
  1664. ground_truth = "/home/stf/miner_pdf/ground_truth.json"
  1665. # 用于区分该公司提供的pdf文件为(扫描件 or 非扫描件)
  1666. firm_excel_file = "/home/stf/miner_pdf/data/certificate.xlsx"
  1667. df = pd.read_excel(firm_excel_file)
  1668. # 封装好的ocr接口
  1669. ocr = OcrAgent(url=url)
  1670. ocr.integrate_sealagent(
  1671. url=seal_url,
  1672. headers=headers
  1673. )
  1674. # 封装好的pipeline
  1675. pipeline = PdfParse_pipeline(
  1676. ocr=ocr,
  1677. firm_dir=unit_data_path,
  1678. out_path=unit_out_path,
  1679. )
  1680. # start
  1681. data = pipeline.parse_pipeline()
  1682. # caculate time cost
  1683. cost_time = time.time() - start_time
  1684. logger.info(f"processing {len(data)} documents, total cost {cost_time // 60} min {cost_time % 60} sec ...")