# wordtable-24-05-17.py
  1. """
  2. 对 Word 中数据库设计表构建图谱
  3. 字段名判断依据:1、字段名称不为保留字 2、字母和下划线 3、驼峰命名 4、注意数字
  4. 字段类型判断:re.match(r'\w+\(\d+\)', s)
  5. 1、判断是否标准表 [依据]-> 判断是否包含列类型 -> 找到列
  6. 2、判断列名 [是]-> 判断是否符合列名规范
  7. 3、判断列中文名 [规则]-> 找到列中文名
  8. 4、判断列类型 [是]-> 判断是否符合列规范
  9. 5、判断主键 [未找到主键]-> 判断是否是ID
  10. 6、判断是否必填字段 [依据]-> 分类[是否为空|是否必填] -> [未找到字段]-> 设定默认值必填
  11. 7、判断列注释 [未找到注释]-> 中文名 [未找到中文名]-> 表名+列名 -> [列类型为bool、时间、流等]-> 添加UUID
  12. 8、字段唯一判断条件 <列名,列类型,注释>
  13. """
  14. import re
  15. import uuid
  16. import logging
  17. from collections import Counter
  18. import docx
  19. from docx import Document
  20. from docx.oxml.table import CT_Tbl
  21. from docx.oxml.text.paragraph import CT_P
  22. from docx.table import _Cell, Table
  23. from docx.text.paragraph import Paragraph
  24. import jieba
  25. import pandas as pd
  26. import networkx as nx
  27. import matplotlib.pyplot as plt
  28. # from text2vec import Similarity
  29. # sim = Similarity()
  30. from py2neo import Node, Graph, Relationship
# Module-level Neo4j connection used by Word.predict().
# NOTE(review): this connects AND wipes the whole database at import time,
# with hard-coded credentials — consider moving both behind the __main__ guard.
graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
graph.delete_all()
  33. # # 列类型判断
  34. # with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
  35. # COLUMN_TYPES = {i.strip() for i in fp.readlines()}
  36. # # 列名判断
  37. # with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
  38. # COLUMN_DICT = {i.strip(): 'name' for i in fp.readlines()}
  39. # # 注释判断
  40. # with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
  41. # REMARK_DICT = {i.strip(): 'remark' for i in fp.readlines()}
  42. COLUMN_TYPES = {'int', 'bigint', 'tinyint', 'smallint', 'bigint unsigned', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
  43. TYPE_DICT = {'类型': 'column_type', '数据类型': 'column_type'}
  44. COLUMN_DICT = {'名称': 'name', '字段名': 'name', 'field name': 'name', '字段代码': 'name', '代码': 'name', '物理字段名': 'name'}
  45. C_NAME_DICT = {'字段中文名': 'c_name', '中文含义': 'c_name', '名字': 'c_name', '字段名称': 'c_name', '逻辑字段名': 'c_name'}
  46. REMARK_DICT = {'Alias': 'remark', 'description': 'remark', '说明': 'remark', '描述': 'remark', '备注': 'remark'}
  47. REQUIRED_DICT = {'空/非空': 'required', '可不可以为空': 'required', '是否为空': 'required', '允许空值': 'required', '是否必填': 'required', '空值': 'required'}
  48. PRIMARY_KEY_DICT = {'主键': 'primary_key'}
  49. FOREIGN_KEY_DICT = {'外键': 'foreign_key'}
  50. class LoggerHandler(logging.Logger):
  51. def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
  52. super().__init__(name)
  53. self.setLevel(logging.INFO)
  54. self.fmt = logging.Formatter(fmt)
  55. self.set_console_handler(console_handler_level)
  56. def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
  57. ch = logging.StreamHandler()
  58. ch.setLevel(console_handler_level)
  59. ch.setFormatter(self.fmt)
  60. self.addHandler(ch)
  61. logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
  62. class Word:
  63. def __init__(self, path: str, draw: bool = False) -> None:
  64. self.draw = draw
  65. self.doc = Document(path)
  66. if draw:
  67. self.G = nx.Graph()
  68. self.namecount = dict({})
  69. self.all_tables = pd.DataFrame()
  70. def iter_block_item(self, parent):
  71. if isinstance(parent, docx.document.Document):
  72. parent_elm = parent.element.body
  73. elif isinstance(parent, _Cell):
  74. parent_elm = parent._tc
  75. else:
  76. raise ValueError("something error")
  77. for child in parent_elm.iterchildren():
  78. if isinstance(child, CT_P):
  79. yield Paragraph(child, parent)
  80. elif isinstance(child, CT_Tbl):
  81. yield Table(child, parent)
  82. def parse(self) -> tuple:
  83. for block in self.iter_block_item(self.doc):
  84. if block.style.name == 'Heading 1' and block.text:
  85. yield ('Heading', block.text.lower())
  86. elif block.style.name == 'Heading 2' and block.text:
  87. yield ('Heading', block.text.lower())
  88. elif block.style.name == 'Heading 3' and block.text:
  89. yield ('Heading', block.text.lower())
  90. elif block.style.name == 'Heading 4' and block.text:
  91. yield ('Heading', block.text.lower())
  92. elif block.style.name == 'Heading 5' and block.text:
  93. yield ('Heading', block.text.lower())
  94. elif block.style.name == 'Heading 6' and block.text:
  95. yield ('Heading', block.text.lower())
  96. elif block.style.name == 'Normal' and block.text:
  97. yield ('Normal', block.text.lower())
  98. elif block.style.name == 'Table Grid':
  99. tables = []
  100. for row in block.rows:
  101. rows = []
  102. for cell in row.cells:
  103. for paragraph in cell.paragraphs:
  104. rows.append(paragraph.text.strip().lower())
  105. tables.append(rows)
  106. yield ('Table', tables)
  107. elif block.style.name == 'Normal Table':
  108. tables = []
  109. for row in block.rows:
  110. rows = []
  111. for cell in row.cells:
  112. for paragraph in cell.paragraphs:
  113. rows.append(paragraph.text.strip().lower())
  114. tables.append(rows)
  115. yield ('Table', tables)
  116. elif block.text:
  117. yield ('Unknow', block)
  118. # def clean_table(self, raw_table):
  119. # table = []
  120. # dirty_table = []
  121. # while raw_table and '' in raw_table[0]:
  122. # raw_table = raw_table[1:]
  123. # # 表格预处理
  124. # rowslen = [len(row) for row in raw_table]
  125. # if not rowslen:
  126. # return None
  127. # rowlen = Counter(rowslen).most_common(1)[0][0]
  128. # for i,l in enumerate(rowslen):
  129. # if l == rowlen:
  130. # table.append(raw_table[i])
  131. # else:
  132. # dirty_table.append(raw_table[i])
  133. # return table, dirty_table
  134. def predict(self):
  135. for r in self.parse():
  136. if r[0] in ['Heading', 'Normal'] and r[1]:
  137. tablename = r[1]
  138. logger.debug(tablename)
  139. elif r[0] == 'Table':
  140. # table = r[1]
  141. # table, dirty_table = self.clean_table(r[1])
  142. table, dirty_table = self.get_table(r[1])
  143. if not table:
  144. continue
  145. # 判断表是否为需要解析的表
  146. if any({'fulltype', 'type'} & {self.detect_type(i) for i in table[1]}):
  147. ############################### 数据库表名解析 ##############################
  148. if re.search("[a-zA-Z_\d]+", tablename):
  149. table_name = re.search("[a-zA-Z_]+", tablename).group()
  150. try:
  151. table_c_name = re.search('[\u4e00-\u9fa5]{3,}', tablename).group()
  152. except Exception as e:
  153. table_c_name = "未知表"
  154. logger.info(f"得到数据库表,表名:{table_name}\t猜测中文表名:{table_c_name}")
  155. else:
  156. table_name = "UnknowTable"
  157. table_c_name = "未知表"
  158. ############################### 表名解析结束 ###############################
  159. ############################# 表字段在此修改 ############################
  160. df = pd.DataFrame(table)
  161. df.columns = df.values.tolist()[0]
  162. df.drop([0], inplace=True)
  163. # # 表字段修正
  164. df.rename(columns={**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **PRIMARY_KEY_DICT, **FOREIGN_KEY_DICT}, inplace=True)
  165. df['table_name'] = table_name
  166. df['table_c_name'] = table_c_name
  167. for i in df.columns:
  168. if self.detect_type(df.loc[1, i]) != 'unknow':
  169. df.rename(columns={i: 'column_type'}, inplace=True)
  170. break
  171. # 判断字段是否允许为空,允许为空值的不可连接
  172. # for i in df.columns:
  173. # if sim.get_score(i, '允许值为空') > 0.7:
  174. # df['unique'] = df[i].apply(lambda x: str(uuid.uuid1()) if x != 'n' else '')
  175. # break
  176. # elif sim.get_score(i, '必填') > 0.7:
  177. # df['unique'] = df[i].apply(lambda x: '' if x != 'n' else str(uuid.uuid1()))
  178. # break
  179. # if 'unique' not in df.columns:
  180. # print("无法判断字段是否必填")
  181. # df['unique'] = ''
  182. ############################### 必填字段判断 ###############################
  183. if 'required' not in df.columns:
  184. logger.warning("无法判断字段是否必填,设定默认值必填,所有字段皆可关联!")
  185. df['required'] = ''
  186. ############################### 必填字段判断 ###############################
  187. # 注释字段必填
  188. if 'remark' not in df.columns:
  189. if 'c_name' not in df.columns:
  190. logger.warning(f"未找到注释字段,当前字段包含:{df.columns}")
  191. df['remark'] = ''
  192. else:
  193. logger.warning(f"未找到注释字段,使用字段中文名代替!,当前字段包含:{df.columns}")
  194. df['remark'] = df['c_name']
  195. ############################### 指定字段不可关联 ###############################
  196. df['remark'] = df['remark'] + df['column_type'].apply(lambda x: str(uuid.uuid1()) if x in ['date', 'datetime', 'blob', 'text'] else '')
  197. df['remark'] = df['remark'] + df['name'].apply(lambda x: str(uuid.uuid1()) if x in ['create_time', 'update_time'] else '')
  198. df['remark'] = df.apply(lambda x: x['name'] if not x['remark'] else x['remark'], axis=1)
  199. ############################### 指定字段不可关联 ###############################
  200. ############################### 为空字段不可关联 ###############################
  201. pass
  202. ############################### 为空字段不可关联 ###############################
  203. if not df.query(' name in ["id", "ID", "Id"] ').empty:
  204. idx = df.query(' name in ["id", "ID", "Id"] ').index[0]
  205. df.loc[idx, 'remark'] = ''.join([table_name, '_id'])
  206. # 判断唯一的依据
  207. logger.debug(f'''remark type: {type(df.loc[1, 'remark'])}''')
  208. logger.debug(f'''column_type type: {type(df.loc[1, 'column_type'])}''')
  209. df['vec'] = df['name'] + '_' + df['column_type'] + '_' + df['remark']
  210. # ############################### 表字段结束修改 ##############################
  211. # if self.draw:
  212. # self.G.add_node(table_name)
  213. # nodelist = [
  214. # (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
  215. # ]
  216. # self.G.add_nodes_from(nodelist)
  217. # edgelist = [
  218. # (table_name, node[0]) for node in nodelist
  219. # ]
  220. # self.G.add_edges_from(edgelist)
  221. # 创建表节点
  222. result = graph.nodes.match("表", name = table_name, c_name=table_c_name)
  223. if len(result) > 0:
  224. start_node = result.first()
  225. else:
  226. start_node = Node("表", name=table_name, c_name=table_c_name)
  227. graph.create(start_node)
  228. # 迭代表字段
  229. for item in df.to_dict(orient ='records'):
  230. # 确保属性插入正常,删除非字符串键值
  231. for key in set(item.keys()):
  232. if not isinstance(key, str):
  233. del item[key]
  234. ############################# 在此修改 ############################
  235. # 字段名设置合并条件
  236. name = item['name']
  237. if not item['vec']:
  238. item['vec'] = table_name + '_' + name
  239. ############################# 结束修改 ############################
  240. # 创建字段节点
  241. result = graph.nodes.match("字段", vec = item['vec'])
  242. if len(result) > 0:
  243. # 查询到相关字段,使用旧字段
  244. end_node = result.first()
  245. relationship = Relationship(start_node, "related", end_node, **{"name": item['name']})
  246. else:
  247. # 未查询到相关字段,创建节点
  248. end_node = Node("字段", **item)
  249. graph.create(end_node)
  250. relationship = Relationship(start_node, "has", end_node)
  251. # 创建表字段关系
  252. graph.merge(relationship)
  253. else:
  254. print("非标准表格", table)
  255. else:
  256. print(r)
  257. print(self.all_tables.columns)
  258. print(self.all_tables)
  259. def detect_type(self, text: str):
  260. fulltype = re.match(r'(\w+)\(\d+\)', text)
  261. if fulltype and (fulltype.group(1) in COLUMN_TYPES):
  262. return 'fulltype'
  263. elif text in COLUMN_TYPES:
  264. return 'type'
  265. else:
  266. return 'unknow'
  267. def get_table(self, raw_table):
  268. table = []
  269. dirty_table = []
  270. has_head = False
  271. for row in raw_table:
  272. if has_head:
  273. table.append(row)
  274. elif set(row) & set({**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **FOREIGN_KEY_DICT}.keys()):
  275. head = row
  276. has_head = True
  277. else:
  278. dirty_table.append(row)
  279. # for row in raw_table:
  280. # if get_head:
  281. # table.append(row)
  282. # continue
  283. # for col in row:
  284. # fulltype = re.match(r'(\w+)\(\d+\)', col)
  285. # if fulltype and (fulltype.group(1) in COLUMN_TYPES):
  286. # table.append(row)
  287. # get_head = True
  288. # break
  289. # elif col in COLUMN_TYPES:
  290. # table.append(row)
  291. # get_head = True
  292. # break
  293. # else:
  294. # head = row
  295. if table and (len(head) == len(table[0])) and (len(Counter([len(_) for _ in table]).keys()) == 1):
  296. table.insert(0, head)
  297. return table, dirty_table
  298. else:
  299. return None, dirty_table
  300. def draw(self):
  301. if self.draw:
  302. nx.draw(self.G, with_labels = True)
  303. plt.show()
  304. else:
  305. return "Draw is not enabled"
  306. if __name__ == '__main__':
  307. # path = '''data/数据库设计说明书.docx'''
  308. path = '''data/数据库设计文档.docx'''
  309. path = '''data/数据库设计(1).docx'''
  310. path = '''data/数据库设计(2).docx'''
  311. path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
  312. path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
  313. path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
  314. path = '''data/租房查询系统_数据库设计说明书_2.0.docx'''
  315. path = '''data/url-ukWkMKhnRgCvxVZt.docx'''
  316. path = '''data/url-qqp17mI32jTyozQt.docx'''
  317. path = '''data/电商-数据库详细设计说明书V0.4.docx'''
  318. word = Word(path)
  319. word.predict()