extract_financial_report.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. import os
  2. import re
  3. import time
  4. from re import match
  5. from tqdm import tqdm
  6. from scan_dir import scan_dir
  7. from instance_locate import get_instances_by_title
  8. from ocr_api import OcrAgent, find_current_row
  9. import datetime
  10. def is_price(word: str) -> bool:
  11. pattern = (
  12. r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв"
  13. r"|ден|RM|MT|lei|zł|USD|GBP|EUR|JPY|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR"
  14. r"|PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
  15. r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
  16. )
  17. char_set = set('1234567890,.')
  18. if re.fullmatch(pattern, word):
  19. return True
  20. elif sum([0 if s in char_set else 1 for s in word]) == 0:
  21. return True
  22. else:
  23. return False
  24. def extract_financial_report(path: str, year: int = None) -> list:
  25. """
  26. 财报解析
  27. Args:
  28. path:
  29. year:
  30. Returns:
  31. results
  32. """
  33. instances = get_instances_by_title(
  34. path,
  35. ['财务状况', '{}年审计报告'.format(year - 1), '{}年审计报告'.format(year - 2)]
  36. )
  37. results = []
  38. ocr_agent = OcrAgent("http://120.48.103.13:18000/ctr_ocr")
  39. for item in instances:
  40. if item['tables']:
  41. table_name = [t['table_name'] for t in item['tables']]
  42. profits = []
  43. for table in item['tables']:
  44. profit = []
  45. for row in table['table']:
  46. if list(filter(lambda x: match(r'.*利润.*', x) is not None, row)):
  47. profit.append(row)
  48. profits.append(profit)
  49. results.append({
  50. 'title': table_name,
  51. 'result': profits,
  52. 'pages': [i['page_numbers'] for i in item['tables']],
  53. 'chapter': item['title']
  54. })
  55. elif item['page_number'] >= item['end_page']:
  56. print('Wrong titles extracted at {}'.format(item['title']))
  57. else:
  58. images = list(filter(
  59. lambda x: (item['page_number'] <= int(x.split('_')[2]) <= item['end_page'])
  60. and (x.endswith('.jpg') or x.endswith('.png'))
  61. and os.path.isfile(os.path.join(item['image_loc'], x)),
  62. os.listdir(item['image_loc']))
  63. )
  64. # for image in images:
  65. # ocr = table_pic_ocr(os.path.join(item['image_loc'], image))
  66. # pass
  67. '''paddleOCR abandoned
  68. ocr_results = table_pic_ocr_batch([os.path.join(item['image_loc'], image) for image in images])
  69. candidate = []
  70. for i in range(len(images)):
  71. page = images[i]
  72. for data in ocr_results[i]:
  73. if data['type'] in ('header', 'footer', 'table_caption', 'figure_caption', 'title'):
  74. for text in data['res']:
  75. if '利润' in text['text']:
  76. candidate.append(page)
  77. break
  78. elif data['type'] in ('text', 'figure'):
  79. for text in data['res']:
  80. if '净利润' in text['text']:
  81. candidate.append(page)
  82. break
  83. elif data['type'] in ('table',):
  84. table = pd.read_html(data['res']['html'])[0].values.tolist()
  85. for row in table:
  86. if '净利润' in ''.join([str(i) for i in row]):
  87. candidate.append(page)
  88. break
  89. else:
  90. for text in data['res']:
  91. if '净利润' in text['text']:
  92. candidate.append(page)
  93. break
  94. '''
  95. print('未找到表格 图片识别中')
  96. ocr_results = [ocr_agent.get_content(os.path.join(item['image_loc'], i))['rawjson']['ret'] for i in
  97. tqdm(images)]
  98. candidate = []
  99. rows = []
  100. print('结果分析中')
  101. for i, ret in tqdm(enumerate(ocr_results)):
  102. for res in ret:
  103. if re.match(r'.*(净利润).*', res['word']) is not None:
  104. top = res['rect']['top']
  105. bottom = res['rect']['top'] - res['rect']['height']
  106. candidate.append(
  107. {
  108. 'page': images[i],
  109. 'text': res['word'],
  110. 'top': top,
  111. 'bottom': bottom,
  112. }
  113. )
  114. rows.append(find_current_row(ret, top, bottom))
  115. for it in candidate:
  116. print('定位:\t{}\t定位词:\t{}'.format(it['page'], it['text']))
  117. for i, row in enumerate(rows):
  118. title = []
  119. profits = []
  120. for w in row:
  121. if is_price(w['word']):
  122. profits.append(w['word'])
  123. else:
  124. title.append(w['word'])
  125. if title and profits:
  126. results.append({
  127. 'chapter': item['title'],
  128. 'page': candidate[i]['page'],
  129. 'title': title,
  130. 'result': profits
  131. })
  132. pass
  133. pass
  134. return results
  135. if __name__ == '__main__':
  136. # print(extract_financial_report('./投标文件-修改版9-5-1-1.pdf'))
  137. os.environ["TRANSFORMERS_OFFLINE"] = '1'
  138. y = datetime.datetime.now().year
  139. print(
  140. extract_financial_report(
  141. '/home/zzh/ocr/pdf/美华建设有限公司/投标文件111.pdf',
  142. # '/home/zzh/ocr/pdf/南方电网数字电网研究院有限公司/南方电网数字研究院有限公司.pdf',
  143. # '/home/zzh/pdf_title_image/投标文件-修改版9-5-1-1.pdf',
  144. 2022
  145. )
  146. )
  147. # start = time.time()
  148. # fs = scan_dir('/home/zzh/ocr/pdf/', 'pdf')
  149. #
  150. # for f in fs:
  151. # try:
  152. # print(f)
  153. # print(extract_financial_report(f, 2022))
  154. # print('\n*********Runtime {} s*********\n'.format(time.time() - start))
  155. # except:
  156. # print('Something wrong')
  157. #
  158. # print('\n\n{}'.format(time.time() - start))