preprocess.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2023-12-25 10:19:57
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2024-02-19 11:43:01
  6. import pandas as pd
  7. from urllib import parse
# Module-level list, initially empty. Nothing in the visible part of this file
# reads from it or appends to it.
all_keys = []
  9. # 解析URL
  10. def parse_url(url):
  11. query_string = parse.urlparse(url)
  12. return query_string.scheme, query_string.netloc, query_string.path, query_string.params, query_string.fragment, parse.parse_qs(query_string.query)
  13. # 解析路径
  14. def parse_path(url):
  15. path_string = parse.urlparse(url).path
  16. return path_string
  17. # 解析appCode参数
  18. def parse_appCode(url):
  19. query_string = parse.urlparse(url).query
  20. appCode = parse.parse_qs(query_string).get('appCode')
  21. if appCode:
  22. return appCode[0]
  23. # 解析内嵌页URL
  24. def parse_iframeUrl(url):
  25. fragment_string = parse.urlparse(url).fragment
  26. iframe_url = parse.parse_qs(fragment_string).get('url')
  27. iframe_from = parse.parse_qs(fragment_string).get('from')
  28. if iframe_url:
  29. path = parse.urlparse(iframe_url[0]).path
  30. if path and iframe_from:
  31. return path, iframe_from[0]
  32. elif path:
  33. return path, None
  34. elif iframe_from:
  35. return None, iframe_from[0]
  36. # 解析appcontext
  37. def parse_appcontext(url):
  38. query_string = parse.urlparse(url).query
  39. appcontext = parse.parse_qs(query_string).get('appcontext')
  40. if appcontext:
  41. return appcontext[0]
  42. # 解析编辑参数
  43. def parse_Edit(url):
  44. query_string = parse.urlparse(url).query
  45. isEdit = parse.parse_qs(query_string).get('isEdit')
  46. if isEdit:
  47. return isEdit[0]
  48. # 解析查询参数
  49. def parse_Query(url):
  50. query_string = parse.urlparse(url).query
  51. isQuery = parse.parse_qs(query_string).get('isQuery')
  52. if isQuery:
  53. return isQuery[0]
  54. # 解析编辑标签
  55. def parse_editFlag(url):
  56. query_string = parse.urlparse(url).query
  57. editFlag = parse.parse_qs(query_string).get('editFlag')
  58. if editFlag:
  59. return editFlag[0]
# Disabled one-off script, kept as a bare string literal so it never executes.
# Every candidate `pd.read_excel(...)` line inside it is itself commented out,
# so the snippet does not assign `df`; one of those lines has to be enabled
# before the rest can run.
# Steps performed on `df`: split column 'Unnamed: 1' on whitespace and store
# token index 2 as 'tag' and token index 1 as 'url', delete the two 'Unnamed'
# columns, drop rows whose tag equals the literal 'undefined', then write the
# result to temp.xlsx.
"""
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="基建管理应用")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="数字供应链")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="合同管理")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="安全生产")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="创新管理")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="基建智慧工程")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="并网服务管理")
# df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="基础应用")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="计划预算管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="成本管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="资金管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="核算管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="报账管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="工程财务管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="资产价值管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="物资财务管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="价格管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="税务管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="会计档案")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="共享服务")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="报表管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="综合管理")
# df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="基础应用")
df['tag'] = df['Unnamed: 1'].apply(lambda x: x.split()[2])
df['url'] = df['Unnamed: 1'].apply(lambda x: x.split()[1])
del df['Unnamed: 0']
del df['Unnamed: 1']
df.drop(df[df['tag'] == 'undefined'].index, inplace=True)
df.to_excel("temp.xlsx", sheet_name='Sheet1')
"""
  91. # df['tag'] = df['url'].apply(lambda x: x.split("/")[-1].split("=")[-1])
# Disabled script (bare string literal, never executed).
# Reads one sheet of an Excel workbook, deletes the '域' column, sets a constant
# 'domain' column, derives 'path' and 'appCode' from each row's 'url' with
# parse_path / parse_appCode, deletes 'url', prints the frame and writes it as
# JSON Lines (one record per line, non-ASCII characters kept) to 资产域.json.
"""
df = pd.read_excel("资产域关联字段查询表.xlsx", sheet_name="资产域")
del df['域']
df['domain'] = '资产域'
df['path'] = df['url'].apply(lambda x: parse_path(x))
df['appCode'] = df['url'].apply(lambda x: parse_appCode(x))
del df['url']
print(df)
df.to_json('资产域.json', orient='records', lines=True, force_ascii=False)
"""
# Disabled script (bare string literal, never executed).
# Reads one sheet of an Excel workbook, sets a constant 'domain' column,
# derives 'path' and 'appCode' from each row's 'url' with parse_path /
# parse_appCode, deletes 'url', prints the frame and writes it as JSON Lines
# (one record per line, non-ASCII characters kept) to 财务域.json.
"""
df = pd.read_excel("财务域关联字段查询表.xlsx", sheet_name="财务域")
df['domain'] = '财务域'
df['path'] = df['url'].apply(lambda x: parse_path(x))
df['appCode'] = df['url'].apply(lambda x: parse_appCode(x))
del df['url']
print(df)
df.to_json('财务域.json', orient='records', lines=True, force_ascii=False)
"""
# Disabled script (bare string literal, never executed).
# Reads one sheet of an Excel workbook and strips leading/trailing single
# quotes from the four title columns and from 'url'; in the fourth title
# column any non-string cell becomes None instead. It then sets a constant
# 'domain' column and derives 'path', 'appcontext', 'isEdit', 'editFlag' and
# 'isQuery' from 'url' using the parse_* helpers of this module. Unlike the
# sibling scripts, 'url' is not deleted. The frame is printed and written as
# JSON Lines (non-ASCII characters kept) to 营销域.json.
"""
df = pd.read_excel("营销域关联字段查询表.xlsx", sheet_name="营销域")
df['一级标题'] = df['一级标题'].apply(lambda x: x.strip('\''))
df['二级标题'] = df['二级标题'].apply(lambda x: x.strip('\''))
df['三级标题'] = df['三级标题'].apply(lambda x: x.strip('\''))
df['四级标题'] = df['四级标题'].apply(lambda x: x.strip('\'') if isinstance(x, str) else None)
df['url'] = df['url'].apply(lambda x: x.strip('\''))
df['domain'] = '营销域'
df['path'] = df['url'].apply(lambda x: parse_path(x))
df['appcontext'] = df['url'].apply(lambda x: parse_appcontext(x))
df['isEdit'] = df['url'].apply(lambda x: parse_Edit(x))
df['editFlag'] = df['url'].apply(lambda x: parse_editFlag(x))
df['isQuery'] = df['url'].apply(lambda x: parse_Query(x))
print(df)
df.to_json('营销域.json', orient='records', lines=True, force_ascii=False)
"""
# Disabled script (bare string literal, never executed).
# Reads sheet "Sheet1" of an Excel workbook, sets constant 'domain' and
# 'netloc' columns (the latter a fixed IP address string), derives 'path' and
# 'appCode' from 'url', stores the result of parse_iframeUrl (a tuple or None)
# in 'iframe', deletes 'url', prints the frame and writes it as JSON Lines
# (non-ASCII characters kept) to 人资域.json.
"""
df = pd.read_excel("人资域关联字段查询表.xlsx", sheet_name="Sheet1")
df['domain'] = '人资域'
df['netloc'] = '10.10.21.23'
df['path'] = df['url'].apply(lambda x: parse_path(x))
df['appCode'] = df['url'].apply(lambda x: parse_appCode(x))
df['iframe'] = df['url'].apply(lambda x: parse_iframeUrl(x))
del df['url']
print(df)
df.to_json('人资域.json', orient='records', lines=True, force_ascii=False)
"""
  137. """