record_parser.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. #!/usr/bin/python
  2. # -*- coding=utf-8 -*-
  3. # @Create Time: 2024-01-04 11:57:15
  4. # @Last Modified time: 2024-01-23 11:09:55
  5. from datetime import datetime
  6. import json
  7. import urllib
  8. # import pymysql
  9. import pandas as pd
  10. # pd.set_option('display.width', 1000)
  11. # pd.set_option('display.max_rows', None)
  12. # pd.set_option('display.max_columns', None)
  13. pd.set_option('max_colwidth', 1000)
  14. import logging
  15. from app.main.preprocess import parse_url
  16. """
  17. rzdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/人资域.json', orient='records', lines=True, encoding='utf-8')
  18. zcdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/资产域.json', orient='records', lines=True, encoding='utf-8')
  19. cwdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/财务域.json', orient='records', lines=True, encoding='utf-8')
  20. yxdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/营销域.json', orient='records', lines=True, encoding='utf-8')
  21. """
  22. """
  23. from flask.ext.sqlalchemy import SQLAlchemy
  24. db = SQLAlchemy()
  25. class Recorder(db.Model):
  26. __tablename__ = 'recoder'
  27. record_Time = db.Column(db.Time)
  28. current_Url = db.Column(db.Text)
  29. full_Url = db.Column(db.Text)
  30. sim_Url = db.Column(db.Text)
  31. user_Id = db.Column(db.Text)
  32. device_Id = db.Column(db.Text)
  33. request_Method = db.Column(db.Text)
  34. event_Type = db.Column(db.Text)
  35. form_Data = db.Column(db.Text)
  36. json_Data = db.Column(db.Text)
  37. value_Data = db.Column(db.Text)
  38. status_Code = db.Column(db.Text)
  39. target_Url = db.Column(db.Text)
  40. iframe_Url = db.Column(db.Text)
  41. button_Text = db.Column(db.Text)
  42. db = pymysql.connect(host="192.168.1.202", port=13389,
  43. user="root", passwd="zh123456",
  44. db="mzinfo", autocommit=True)
  45. cursor = db.cursor()
  46. """
  47. """
  48. class MYDATABASE:
  49. def __init__(self, db_name=None):
  50. '''连接数据库,创建游标'''
  51. config = dict(zip(['host', 'user', 'port', 'password'],
  52. ['192.168.1.202', 'root', 13388, 'zh123456']))
  53. config.update(database=db_name)
  54. self.connection = pymysql.connect(**config)
  55. self.cursor = self.connection.cursor()
  56. def __del__(self):
  57. self.connection.close()
  58. def create_database(self, db_name: str):
  59. '''新建数据库'''
  60. sql = f'CREATE DATABASE IF NOT EXISTS {db_name};'
  61. self.cursor.execute(sql)
  62. def create_table(self, tbl_name: str):
  63. '''新建数据库'''
  64. sql = f'CREATE TABLE IF NOT EXISTS {tbl_name};'
  65. self.cursor.execute(sql)
  66. def drop_database(self, db_name: str):
  67. '''删除数据库'''
  68. sql = f'DROP DATABASE IF EXISTS {db_name};'
  69. self.cursor.execute(sql)
  70. def drop_table(self, tbl_name: str):
  71. '''删除数据库'''
  72. sql = f'DROP TABLE IF EXISTS {tbl_name};'
  73. self.cursor.execute(sql)
  74. def query(self, sql: str):
  75. '''以数据框形式返回查询据结果'''
  76. self.cursor.execute(sql)
  77. data = self.cursor.fetchall() # 以元组形式返回查询数据
  78. header = [t[0] for t in self.cursor.description]
  79. df = pd.DataFrame(list(data), columns=header) # pd.DataFrame 对列表具有更好的兼容性
  80. return df
  81. def show_databases(self):
  82. '''查看服务器上的所有数据库'''
  83. sql = 'SHOW DATABASE;'
  84. return self.query(sql)
  85. def select_database(self):
  86. '''查看当前数据库'''
  87. sql = 'SELECT DATABASE();'
  88. return self.query(sql)
  89. def show_tables(self):
  90. '''查看当前数据库中所有的表'''
  91. sql = 'SHOW TABLES;'
  92. return self.query(sql)
  93. def insert_table(self, sql):
  94. try:
  95. # 执行sql语句
  96. cursor.execute(sql)
  97. self.connection.commit()
  98. except Exception as e:
  99. # 发生错误时回滚
  100. self.connection.rollback()
  101. mydb = MYDATABASE(db_name="mzinfo")
  102. df = mydb.query("SELECT * FROM recoder;")
  103. # print(df)
  104. df.to_json("temp.json", orient='records', lines=True, force_ascii=False)
  105. """
  106. df = pd.read_json(path_or_buf='all_data.json', orient='records', lines=True, encoding='utf-8')
  107. # df = pd.read_json(path_or_buf='temp.json', orient='records', lines=True, encoding='utf-8')
  108. # print(df['user_Id'].unique())
  109. # print(df.groupby('user_Id').size().sort_values())
  110. df = df[df['user_Id'] == 'liguiwen@mz.gd.csg.cn']
  111. # df = df.sort_values(by='record_Time')
  112. # print(df.head)
  113. # print(df.iloc[130]['current_Url'])
  114. # print(df['current_Url'].unique())
  115. # print(df['current_Url'].str.split("?"))
  116. # print(df['value_Data'].unique())
  117. # url = df.iloc[0]['current_Url']
  118. # print(len(parse_url(url)))
  119. # print(df['current_Url'].apply(lambda x: parse_url(x)))
  120. """
  121. df['current_Url'] = df['current_Url'].apply(lambda x: parse_url(x))
  122. df['target_Url'] = df['target_Url'].apply(lambda x: parse_url(x))
  123. df['iframe_Url'] = df['iframe_Url'].apply(lambda x: parse_url(x))
  124. df['value_Data'] = df['current_Url'].apply(lambda x: x[-1])
  125. """
  126. # print(df.sample(10, random_state=42)['current_Url'])
  127. class UrlParser:
  128. def __init__(self):
  129. self.rzdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/人资域.json', orient='records', lines=True, encoding='utf-8')
  130. self.zcdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/资产域.json', orient='records', lines=True, encoding='utf-8')
  131. self.cwdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/财务域.json', orient='records', lines=True, encoding='utf-8')
  132. self.yxdf = pd.read_json(path_or_buf='/mnt/d/desktop/梅州电网/营销域.json', orient='records', lines=True, encoding='utf-8')
  133. def process(self, row):
  134. row['current_Url'] = row['current_Url'].apply(lambda x: parse_url(x))
  135. # print(row.iloc[0])
  136. return self.process_record(row.iloc[0])
  137. def process_record(self, item):
  138. i = item['current_Url']
  139. j = item['target_Url']
  140. user = item['user_Id']
  141. # result = item
  142. result = None
  143. # if (item['request_Method'] == 'GET') and (not item['form_Data']):
  144. # print(0)
  145. # elif (item['request_Method'] == 'GET') and (item['form_Data'] != 'None'):
  146. # print(1, item['form_Data'])
  147. # elif (item['request_Method'] == 'POST') and (item['form_Data'] != 'None'):
  148. # print(2, item['form_Data'])
  149. # elif item['request_Method'] == 'POST' and (not item['form_Data']):
  150. # print(3)
  151. if i[1] == '10.10.21.23':
  152. result = {'domain': '人资域'}
  153. if i[5].get("appCode"):
  154. task = self.rzdf.query(f'''path == "{i[2]}" & appCode == "{i[5]['appCode'][0]}"''')
  155. result = {
  156. 'time': item['record_Time'],
  157. 'user': user,
  158. 'method': item['request_Method'],
  159. 'hasform': True if item['form_Data'] else False,
  160. 'domain': '人资域',
  161. '一级标题': task['一级标题'].values[0],
  162. '二级标题': task['二级标题'].values[0],
  163. '三级标题': task['三级标题'].values[0],
  164. 'text': item['button_Text'] if item['button_Text'] else None
  165. }
  166. elif i[1] == '10.10.21.28':
  167. result = {'domain': '资产域'}
  168. if i[5].get('appCode'):
  169. task = self.zcdf.query(f'''path == "{i[2]}" & appCode == "{i[5]['appCode'][0]}"''')
  170. result = {
  171. 'time': item['record_Time'],
  172. 'user': user,
  173. 'method': item['request_Method'],
  174. 'hasform': True if item['form_Data'] else False,
  175. 'domain': '资产域',
  176. '一级标题': task['一级标题'].values[0],
  177. '二级标题': task['二级标题'].values[0],
  178. '三级标题': task['三级标题'].values[0],
  179. 'text': item['button_Text'] if item['button_Text'] else None
  180. }
  181. elif i[1] == 'fms.gmp.cloud.hq.iv.csg':
  182. result = {'domain': '财务域'}
  183. if i[5].get('appCode'):
  184. task = self.cwdf.query(f'''path == "{i[2]}" & appCode == "{i[5]['appCode'][0]}"''')
  185. result = {
  186. 'time': item['record_Time'],
  187. 'user': user,
  188. 'method': item['request_Method'],
  189. 'hasform': True if item['form_Data'] else False,
  190. 'domain': '财务域',
  191. '一级标题': task['一级标题'].values[0],
  192. '二级标题': task['二级标题'].values[0],
  193. '三级标题': task['三级标题'].values[0],
  194. 'text': item['button_Text'] if item['button_Text'] else None
  195. }
  196. elif i[1] == '10.150.23.1:8010':
  197. result = {'domain': '营销域'}
  198. fd = item['form_Data']
  199. if item['form_Data'] and item['form_Data'] != 'None':
  200. try:
  201. if item['form_Data'][0] != "{":
  202. item["params_Data"] = item['form_Data']
  203. form_data = None
  204. else:
  205. item['form_Data'] = item['form_Data'].replace("\"remark\":\"[", "\"remark\":\"\"[")
  206. item['form_Data'] = item['form_Data'].replace("\"[", "[").replace("]\"", "]").replace("\"{", "{").replace("}\"", "}")
  207. item['form_Data'] = item['form_Data'].replace("object HTMLInputElement", "").replace("[null]", "[]").replace("\"null\"", "\"\"")
  208. item['form_Data'] = item['form_Data'].replace("\n", "")
  209. form_data = json.loads(item['form_Data'])
  210. except Exception as e:
  211. logging.error(item['form_Data'])
  212. logging.error(fd)
  213. form_data = None
  214. raise e
  215. else:
  216. form_data = None
  217. if form_data and '_INVOKE_FUNC_TITLE_' in form_data:
  218. title = form_data['_INVOKE_FUNC_TITLE_'][0]
  219. appcontext = form_data['_INVOKE_FUNC_URL_'][0].split('/')[1]
  220. logging.debug(appcontext)
  221. task = self.yxdf.query(f'''四级标题 == "{title}" & appcontext == "{appcontext}"''')
  222. if task.empty:
  223. task = self.yxdf.query(f'''三级标题 == "{title}" & appcontext == "{appcontext}"''')
  224. if task.empty:
  225. task = self.yxdf.query(f'''四级标题 == "{title}"''')
  226. if task.empty:
  227. task = self.yxdf.query(f'''三级标题 == "{title}"''')
  228. try:
  229. result = {
  230. 'time': item['record_Time'],
  231. 'user': user,
  232. 'method': item['request_Method'],
  233. 'hasform': True,
  234. 'domain': '营销域',
  235. '一级标题': task['一级标题'].values[0],
  236. '二级标题': task['二级标题'].values[0],
  237. '三级标题': task['三级标题'].values[0],
  238. '四级标题': task['四级标题'].values[0],
  239. 'text': item['button_Text'] if item['button_Text'] else None
  240. }
  241. except Exception as e:
  242. logging.error(task)
  243. logging.error(item['form_Data'])
  244. logging.error(title)
  245. logging.error(e)
  246. logging.error(form_data['_INVOKE_FUNC_URL_'][0])
  247. else:
  248. result = {
  249. 'time': item['record_Time'],
  250. 'user': user,
  251. 'domain': '营销域',
  252. 'hasform': False,
  253. 'text': item['button_Text'] if item['button_Text'] else None
  254. }
  255. elif i[1] == '4a.gd.csg.local':
  256. result = {
  257. 'time': item['record_Time'],
  258. 'user': user,
  259. '域': '登录门户'
  260. }
  261. return result
  262. # parser = UrlParser()
  263. # df['current_Url'] = df['current_Url'].apply(lambda x: parse_url(x))
  264. # # # for _, item in df.sample(1000, random_state=4).iterrows():
  265. # for _, item in df.sample(10, random_state=5).iterrows():
  266. # # for _ in range(5):
  267. # # for _, item in df.get_chunk(10).iterrows():
  268. # # for _, item in item.iterrows():
  269. # print(parser.process_record(item))
  270. # # logging.warning(parser.process_record(item))
  271. def main():
  272. current = df.iloc[0]['current_Url']
  273. target = dict()
  274. for item in df.itertuples():
  275. if item.current_Url == current: # 当前地址没变,页面无跳转(需判断AJAX)
  276. if item.target_Url not in target: # 目标url不在临时目录
  277. target[item.target_Url] = item.Index # 添加目标URL到临时目录
  278. else: # 当前地址发生变化,页面跳转
  279. current = item.current_Url # 当前地址修改
  280. if target.get(current): # 查看是否有请求当前URL的历史记录
  281. print(df.loc[target.get(current)]) # 查看历史记录
  282. else: # 为找到历史记录
  283. print("*********非正常跳转*********") # 非正常跳转
  284. target = dict() # 清除历史记录
  285. target[item.target_Url] = item.Index # 添加当前记录
  286. if __name__ == '__main__':
  287. main()