extract_financial_report.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2024-06-11 13:43:14
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-09-26 14:38:53
  6. import os
  7. import re
  8. from tqdm import tqdm
  9. from ocr import find_current_row
  10. from commonprocess import pic_ocr
  11. from instance_locate import get_instances_by_title
  12. def is_price(word: str) -> bool:
  13. pattern = (
  14. r"(?:\b(?:[BS]/\.|R(?:D?\$|p))|\b(?:[TN]T|[CJZ])\$|Дин\.|\b(?:Bs|Ft|Gs"
  15. r"|K[Mč]|Lek|B[Zr]|k[nr]|[PQLSR]|лв|ден|RM|MT|lei|zł|USD|GBP|EUR|JPY"
  16. r"|CHF|SEK|DKK|NOK|SGD|HKD|AUD|TWD|NZD|CNY|KRW|INR|CAD|VEF|EGP|THB|IDR"
  17. r"|PKR|MYR|PHP|MXN|VND|CZK|HUF|PLN|TRY|ZAR|ILS|ARS|CLP|BRL|RUB|QAR|AED"
  18. r"|COP|PEN|CNH|KWD|SAR)|\$[Ub]|"
  19. r"[^\w\s])\s?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d{1,2})?(?!\.?\d)"
  20. )
  21. char_set = set('1234567890,.')
  22. if re.fullmatch(pattern, word):
  23. return True
  24. elif sum([0 if s in char_set else 1 for s in word]) == 0:
  25. return True
  26. else:
  27. return False
  28. def extract_financial_report(title_list: list, table_list: list, image_list: list, year: int) -> list:
  29. """
  30. 财报解析
  31. Args:
  32. path:
  33. title_list: 标题列表
  34. table_list: 表格列表
  35. image_list: 图片列表
  36. year: 年份
  37. Returns:
  38. results
  39. """
  40. instances = get_instances_by_title(
  41. title_list=title_list,
  42. table_list=table_list,
  43. image_list=image_list,
  44. instances=[
  45. '财务状况',
  46. '近年财务状况表',
  47. '{}年审计报告'.format(year - 1),
  48. '{}年审计报告'.format(year - 2),
  49. '{}年财务审计报告'.format(year - 1),
  50. '{}年财务审计报告'.format(year - 2)
  51. ]
  52. )
  53. print("instances: ", instances)
  54. # TODO 后续内容出现问题
  55. # Wrong titles extracted at 2020 年度审计报告
  56. # Wrong titles extracted at 2021 年度审计报告
  57. # Wrong titles extracted at 附件二 近年财务状况表
  58. results = []
  59. for item in instances:
  60. if item['page_number'] >= item['end_page']:
  61. # if item['page_number'] > item['end_page']:
  62. print('Wrong titles extracted at {}'.format(item['title']))
  63. elif item['tables']:
  64. # table_name = [t['table_name'] for t in item['tables']]
  65. table_name = [t['table_name'] if t['table_name'] else item["title"] for t in item['tables']]
  66. profits = []
  67. for table in item['tables']:
  68. profit = []
  69. for row in table['table']:
  70. if list(filter(lambda x: re.match(r'.*利润.*', x) is not None, row)):
  71. profit.append(row)
  72. profits.append(profit)
  73. results.append({
  74. 'title': table_name,
  75. 'result': profits,
  76. 'pages': [i['page_numbers'] for i in item['tables']],
  77. 'chapter': item['title']
  78. })
  79. elif item.get('images'):
  80. print('未找到表格 图片识别中')
  81. print(item.get('images'))
  82. pages = [
  83. img['page_number'] for img in item.get('images')
  84. ]
  85. ocr_results = [
  86. # pic_ocr.apply_async(kwargs={'image_path': img['image_name']}).get(timeout=30)['rawjson']['ret']
  87. pic_ocr(image_path = img['image_name'])['rawjson']['ret']
  88. for img in item.get('images')
  89. ]
  90. candidate = []
  91. rows = []
  92. print('结果分析中')
  93. for i, ret in tqdm(enumerate(ocr_results)):
  94. for res in ret:
  95. if re.match(r'.*(净利润).*', res['word']) is not None:
  96. # if re.match(r'.*(利润).*', res['word']) is not None:
  97. top = res['rect']['top']
  98. bottom = res['rect']['top'] - res['rect']['height']
  99. candidate.append(
  100. {
  101. 'page': pages[i],
  102. 'text': res['word'],
  103. 'top': top,
  104. 'bottom': bottom,
  105. }
  106. )
  107. rows.append(find_current_row(ret, top, bottom))
  108. for it in candidate:
  109. print('定位:\t{}\t定位词:\t{}'.format(it['page'], it['text']))
  110. for i, row in enumerate(rows):
  111. title = []
  112. profits = []
  113. for w in row:
  114. if is_price(w['word']):
  115. profits.append(w['word'])
  116. else:
  117. title.append(w['word'])
  118. if title and profits:
  119. results.append({
  120. 'chapter': item['title'],
  121. 'page': candidate[i]['page'],
  122. 'title': title,
  123. 'result': profits
  124. })
  125. return results
if __name__ == '__main__':
    # Running this module as a script currently does nothing. The commented-out
    # lines below are a manual driver: they load the title, table and image
    # lists from the JSON files named in `settings` and print the result of
    # extract_financial_report for a hard-coded year (the computed `y` is not
    # used by the call).
    pass
    # import json
    # import datetime
    # from settings import title_n_path, table_list_path, image_path
    # with open(title_n_path, 'r', encoding='utf-8') as fp:
    #     title_list = json.load(fp)
    # with open(table_list_path, 'r', encoding='utf-8') as fp:
    #     table_list = json.load(fp)
    # with open(image_path, 'r', encoding='utf-8') as fp:
    #     image_list = json.load(fp)
    # y = datetime.datetime.now().year
    # print(
    #     extract_financial_report(
    #         title_list=title_list,
    #         table_list=table_list,
    #         image_list=image_list,
    #         year=2022
    #     )
    # )