wordtable-24-05-12.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
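"""
Build a knowledge graph from the database-design tables found in a Word document.
Criteria for recognising a field name: 1. the name is not a reserved word; 2. it consists of letters and underscores; 3. it uses camelCase naming; 4. pay attention to digits.
"""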
  5. import re
  6. import logging
  7. import docx
  8. from docx import Document
  9. from docx.oxml.table import CT_Tbl
  10. from docx.oxml.text.paragraph import CT_P
  11. from docx.table import _Cell, Table
  12. from docx.text.paragraph import Paragraph
  13. import uuid
  14. import pandas as pd
  15. import networkx as nx
  16. import matplotlib.pyplot as plt
  17. from py2neo import Node, Graph, Relationship
# Module-level Neo4j connection; Word.predict() merges table and field nodes through it.
# NOTE(review): the endpoint and credentials are hard-coded here — consider moving them to configuration.
graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
# WARNING: this runs at import time. py2neo documents delete_all() as removing all
# nodes and relationships from the graph, so merely importing this module clears the database.
graph.delete_all()
  20. coltypes = {'int', 'bigint', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
  21. coldict = {'名称': 'colname', '字段名': 'colname', 'Field Name': 'colname', '关联字段': 'alias', 'Alias': 'alias'}
  22. class LoggerHandler(logging.Logger):
  23. def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
  24. super().__init__(name)
  25. self.setLevel(logging.INFO)
  26. self.fmt = logging.Formatter(fmt)
  27. self.set_console_handler(console_handler_level)
  28. def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
  29. ch = logging.StreamHandler()
  30. ch.setLevel(console_handler_level)
  31. ch.setFormatter(self.fmt)
  32. self.addHandler(ch)
# Module-level logger used by Word.predict(); the format passed here drops the
# logger name and file name that LoggerHandler's default format includes.
logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
  34. class Word:
  35. def __init__(self, path: str, draw: bool = False) -> None:
  36. self.draw = draw
  37. self.doc = Document(path)
  38. if draw:
  39. self.G = nx.Graph()
  40. def iter_block_item(self, parent):
  41. if isinstance(parent, docx.document.Document):
  42. parent_elm = parent.element.body
  43. elif isinstance(parent, _Cell):
  44. parent_elm = parent._tc
  45. else:
  46. raise ValueError("something error")
  47. for child in parent_elm.iterchildren():
  48. if isinstance(child, CT_P):
  49. yield Paragraph(child, parent)
  50. elif isinstance(child, CT_Tbl):
  51. yield Table(child, parent)
  52. def parse(self) -> tuple:
  53. for block in self.iter_block_item(self.doc):
  54. if block.style.name == 'Heading 1' and block.text:
  55. yield ('Heading', block.text)
  56. elif block.style.name == 'Heading 2' and block.text:
  57. yield ('Heading', block.text)
  58. elif block.style.name == 'Heading 3' and block.text:
  59. yield ('Heading', block.text)
  60. elif block.style.name == 'Heading 4' and block.text:
  61. yield ('Heading', block.text)
  62. elif block.style.name == 'Heading 5' and block.text:
  63. yield ('Heading', block.text)
  64. elif block.style.name == 'Heading 6' and block.text:
  65. yield ('Heading', block.text)
  66. elif block.style.name == 'Normal' and block.text:
  67. yield ('Normal', block.text)
  68. elif block.style.name == 'Table Grid':
  69. tables = []
  70. for row in block.rows:
  71. rows = []
  72. for cell in row.cells:
  73. for paragraph in cell.paragraphs:
  74. rows.append(paragraph.text.strip())
  75. tables.append(rows)
  76. yield ('Table', tables)
  77. elif block.style.name == 'Normal Table':
  78. tables = []
  79. for row in block.rows:
  80. rows = []
  81. for cell in row.cells:
  82. for paragraph in cell.paragraphs:
  83. rows.append(paragraph.text.strip())
  84. tables.append(rows)
  85. yield ('Table', tables)
  86. def predict(self):
  87. for r in self.parse():
  88. if r[0] in ['Heading', 'Normal']:
  89. tablename = r[1]
  90. logger.debug(tablename)
  91. if r[0] == 'Table':
  92. # 判断表是否为需要解析的表
  93. if any(coltypes & {i.lower() for i in r[1][1]}):
  94. # 数据库表名解析
  95. if re.search("[a-zA-Z_]+", tablename):
  96. tablename = re.search("[a-zA-Z_]+", tablename).group()
  97. logger.info(f"得到数据库表,表名:{tablename}")
  98. df = pd.DataFrame(r[1])
  99. df.columns = df.values.tolist()[0]
  100. df.drop([0], inplace=True)
  101. if self.draw:
  102. self.G.add_node(tablename)
  103. nodelist = [
  104. (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
  105. ]
  106. self.G.add_nodes_from(nodelist)
  107. edgelist = [
  108. (tablename, node[0]) for node in nodelist
  109. ]
  110. self.G.add_edges_from(edgelist)
  111. # 创建表节点
  112. start_node = Node("表", name=tablename)
  113. graph.merge(start_node, "表", "name")
  114. # 表字段修正
  115. df.rename(columns=coldict, inplace=True)
  116. # 别名字段必填
  117. if 'alias' not in df.columns:
  118. logger.warning(f"未找到Alias字段,当前字段包含:{df.columns}")
  119. df['alias'] = ''
  120. # 迭代表字段
  121. for item in df.to_dict(orient ='records'):
  122. # 确保属性插入正常
  123. for key in set(item.keys()):
  124. if not isinstance(key, str):
  125. del item[key]
  126. # 字段名设置合并条件
  127. colname = item['colname']
  128. if not item['alias']:
  129. item['alias'] = tablename + '_' + colname
  130. # 创建字段节点
  131. end_node = Node("字段", name=colname, **item)
  132. graph.merge(end_node, "字段", "alias")
  133. # 创建表字段关系
  134. # relation = Relationship(start_node, 'has', end_node)
  135. HAS = Relationship.type("has")
  136. graph.merge(HAS(start_node, end_node))
  137. def draw(self):
  138. if self.draw:
  139. nx.draw(self.G, with_labels = True)
  140. plt.show()
  141. else:
  142. return "Draw is not enabled"
  143. if __name__ == '__main__':
  144. path = '''数据库设计文档.docx'''
  145. path = '''数据库设计(1).docx'''
  146. path = '''数据库设计(2).docx'''
  147. # path = '''国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
  148. # path = '''FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
  149. # path = '''中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
  150. word = Word(path)
  151. word.predict()