extract_financial_report.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. import os
  2. import re
  3. import time
  4. from re import match
  5. from tqdm import tqdm
  6. from scan_dir import scan_dir
  7. from instance_locate import get_instances_by_title
  8. from ocr_api import OcrAgent, find_current_row
  9. import datetime
  10. def is_price(word: str) -> bool:
  11. pattern = (
  12. r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв"
  13. r"|ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR"
  14. r"|PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
  15. r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
  16. )
  17. char_set = set('1234567890,.')
  18. if re.fullmatch(pattern, word):
  19. return True
  20. elif sum([0 if s in char_set else 1 for s in word]) == 0:
  21. return True
  22. else:
  23. return False
  24. def extract_financial_report(path: str, year: int = None):
  25. instances = get_instances_by_title(path,
  26. ['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)])
  27. results = []
  28. ocr_agent = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
  29. for item in instances:
  30. if item['tables']:
  31. table_name = [t['table_name'] for t in item['tables']]
  32. profits = []
  33. for table in item['tables']:
  34. profit = []
  35. for row in table['table']:
  36. if list(filter(lambda x: match(r'.*利润.*', x) is not None, row)):
  37. profit.append(row)
  38. profits.append(profit)
  39. results.append({
  40. 'title': table_name,
  41. 'result': profits,
  42. 'pages': [i['page_numbers'] for i in item['tables']],
  43. 'chapter': item['title']
  44. })
  45. elif item['page_number'] >= item['end_page']:
  46. print('Wrong titles extracted at {}'.format(item['title']))
  47. else:
  48. images = list(filter(
  49. lambda x: (item['page_number'] <= int(x.split('_')[2]) <= item['end_page'])
  50. and (x.endswith('.jpg') or x.endswith('.png'))
  51. and os.path.isfile(os.path.join(item['image_loc'], x)),
  52. os.listdir(item['image_loc']))
  53. )
  54. # for image in images:
  55. # ocr = table_pic_ocr(os.path.join(item['image_loc'], image))
  56. # pass
  57. '''paddleOCR abandoned
  58. ocr_results = table_pic_ocr_batch([os.path.join(item['image_loc'], image) for image in images])
  59. candidate = []
  60. for i in range(len(images)):
  61. page = images[i]
  62. for data in ocr_results[i]:
  63. if data['type'] in ('header', 'footer', 'table_caption', 'figure_caption', 'title'):
  64. for text in data['res']:
  65. if '利润' in text['text']:
  66. candidate.append(page)
  67. break
  68. elif data['type'] in ('text', 'figure'):
  69. for text in data['res']:
  70. if '净利润' in text['text']:
  71. candidate.append(page)
  72. break
  73. elif data['type'] in ('table',):
  74. table = pd.read_html(data['res']['html'])[0].values.tolist()
  75. for row in table:
  76. if '净利润' in ''.join([str(i) for i in row]):
  77. candidate.append(page)
  78. break
  79. else:
  80. for text in data['res']:
  81. if '净利润' in text['text']:
  82. candidate.append(page)
  83. break
  84. '''
  85. print('未找到表格 图片识别中')
  86. ocr_results = [ocr_agent.get_content(os.path.join(item['image_loc'], i))['rawjson']['ret'] for i in
  87. tqdm(images)]
  88. candidate = []
  89. rows = []
  90. print('结果分析中')
  91. for i, ret in tqdm(enumerate(ocr_results)):
  92. for res in ret:
  93. if re.match(r'.*(净利润).*', res['word']) is not None:
  94. top = res['rect']['top']
  95. bottom = res['rect']['top'] - res['rect']['height']
  96. candidate.append(
  97. {
  98. 'page': images[i],
  99. 'text': res['word'],
  100. 'top': top,
  101. 'bottom': bottom,
  102. }
  103. )
  104. rows.append(find_current_row(ret, top, bottom))
  105. for it in candidate:
  106. print('定位:\t{}\t定位词:\t{}'.format(it['page'], it['text']))
  107. for i, row in enumerate(rows):
  108. title = []
  109. profits = []
  110. for w in row:
  111. if is_price(w['word']):
  112. profits.append(w['word'])
  113. else:
  114. title.append(w['word'])
  115. if title and profits:
  116. results.append({
  117. 'chapter': item['title'],
  118. 'page': candidate[i]['page'],
  119. 'title': title,
  120. 'result': profits
  121. })
  122. pass
  123. pass
  124. return results
  125. if __name__ == '__main__':
  126. # print(extract_financial_report('./投标文件-修改版9-5-1-1.pdf'))
  127. os.environ["TRANSFORMERS_OFFLINE"] = '1'
  128. y = datetime.datetime.now().year
  129. print(extract_financial_report(
  130. '/home/zzh/ocr/pdf/美华建设有限公司/投标文件111.pdf',
  131. # '/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/南方电网数字研究院有限公司.pdf',
  132. # '/home/zzh/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
  133. 2022
  134. ))
  135. # start = time.time()
  136. # fs = scan_dir('/home/zzh/ocr/pdf/', 'pdf')
  137. #
  138. # for f in fs:
  139. # try:
  140. # print(f)
  141. # print(extract_financial_report(f, 2022))
  142. # print('\n*********Runtime {} s*********\n'.format(time.time() - start))
  143. # except:
  144. # print('Something wrong')
  145. #
  146. # print('\n\n{}'.format(time.time() - start))