wordtable-24-05-14.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. """
  2. 对 Word 中数据库设计表构建图谱
  3. 字段名判断依据:1、字段名称不为保留字 2、字母和下划线 3、驼峰命名 4、注意数字
  4. 字段类型判断:re.match(r'\w+\(\d+\)', s)
  5. """
  6. import re
  7. import logging
  8. import docx
  9. from docx import Document
  10. from docx.oxml.table import CT_Tbl
  11. from docx.oxml.text.paragraph import CT_P
  12. from docx.table import _Cell, Table
  13. from docx.text.paragraph import Paragraph
  14. import uuid
  15. import pandas as pd
  16. import networkx as nx
  17. import matplotlib.pyplot as plt
  18. from py2neo import Node, Graph, Relationship
  19. graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
  20. graph.delete_all()
  21. # 列类型判断
  22. with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
  23. coltypes = {i.strip() for i in fp.readlines()}
  24. # 列名判断
  25. with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
  26. coldict = {i.strip(): 'colname' for i in fp.readlines()}
  27. # 注释判断
  28. with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
  29. aliasdict = {i.strip(): 'alias' for i in fp.readlines()}
  30. class LoggerHandler(logging.Logger):
  31. def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
  32. super().__init__(name)
  33. self.setLevel(logging.INFO)
  34. self.fmt = logging.Formatter(fmt)
  35. self.set_console_handler(console_handler_level)
  36. def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
  37. ch = logging.StreamHandler()
  38. ch.setLevel(console_handler_level)
  39. ch.setFormatter(self.fmt)
  40. self.addHandler(ch)
  41. logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
  42. class Word:
  43. def __init__(self, path: str, draw: bool = False) -> None:
  44. self.draw = draw
  45. self.doc = Document(path)
  46. if draw:
  47. self.G = nx.Graph()
  48. def iter_block_item(self, parent):
  49. if isinstance(parent, docx.document.Document):
  50. parent_elm = parent.element.body
  51. elif isinstance(parent, _Cell):
  52. parent_elm = parent._tc
  53. else:
  54. raise ValueError("something error")
  55. for child in parent_elm.iterchildren():
  56. if isinstance(child, CT_P):
  57. yield Paragraph(child, parent)
  58. elif isinstance(child, CT_Tbl):
  59. yield Table(child, parent)
  60. def parse(self) -> tuple:
  61. for block in self.iter_block_item(self.doc):
  62. if block.style.name == 'Heading 1' and block.text:
  63. yield ('Heading', block.text)
  64. elif block.style.name == 'Heading 2' and block.text:
  65. yield ('Heading', block.text)
  66. elif block.style.name == 'Heading 3' and block.text:
  67. yield ('Heading', block.text)
  68. elif block.style.name == 'Heading 4' and block.text:
  69. yield ('Heading', block.text)
  70. elif block.style.name == 'Heading 5' and block.text:
  71. yield ('Heading', block.text)
  72. elif block.style.name == 'Heading 6' and block.text:
  73. yield ('Heading', block.text)
  74. elif block.style.name == 'Normal' and block.text:
  75. yield ('Normal', block.text)
  76. elif block.style.name == 'Table Grid':
  77. tables = []
  78. for row in block.rows:
  79. rows = []
  80. for cell in row.cells:
  81. for paragraph in cell.paragraphs:
  82. rows.append(paragraph.text.strip())
  83. tables.append(rows)
  84. yield ('Table', tables)
  85. elif block.style.name == 'Normal Table':
  86. tables = []
  87. for row in block.rows:
  88. rows = []
  89. for cell in row.cells:
  90. for paragraph in cell.paragraphs:
  91. rows.append(paragraph.text.strip())
  92. tables.append(rows)
  93. yield ('Table', tables)
  94. def predict(self):
  95. for r in self.parse():
  96. if r[0] in ['Heading', 'Normal']:
  97. tablename = r[1]
  98. logger.debug(tablename)
  99. elif r[0] == 'Table':
  100. # 判断表是否为需要解析的表
  101. if any(coltypes & {i.lower() for i in r[1][1]}):
  102. # 数据库表名解析
  103. if re.search("[a-zA-Z_\d]+", tablename):
  104. tablename = re.search("[a-zA-Z_]+", tablename).group()
  105. logger.info(f"得到数据库表,表名:{tablename}")
  106. ############################# 在此修改 ############################
  107. df = pd.DataFrame(r[1])
  108. df.columns = df.values.tolist()[0]
  109. df.drop([0], inplace=True)
  110. ############################# 结束修改 ############################
  111. if self.draw:
  112. self.G.add_node(tablename)
  113. nodelist = [
  114. (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
  115. ]
  116. self.G.add_nodes_from(nodelist)
  117. edgelist = [
  118. (tablename, node[0]) for node in nodelist
  119. ]
  120. self.G.add_edges_from(edgelist)
  121. # 创建表节点
  122. start_node = Node("表", name=tablename)
  123. graph.create(start_node)
  124. ############################### 在此修改 ##############################
  125. # 表字段修正
  126. df.rename(columns=coldict, inplace=True)
  127. df.rename(columns=aliasdict, inplace=True)
  128. df['tablename'] = tablename
  129. df['fullname'] = df['tablename'] + '_' + df['colname']
  130. ############################### 结束修改 ##############################
  131. # 别名字段必填
  132. if 'alias' not in df.columns:
  133. logger.warning(f"未找到注释字段,当前字段包含:{df.columns}")
  134. df['alias'] = ''
  135. # 迭代表字段
  136. for item in df.to_dict(orient ='records'):
  137. # 确保属性插入正常,删除非字符串键值
  138. for key in set(item.keys()):
  139. if not isinstance(key, str):
  140. del item[key]
  141. ############################# 在此修改 ############################
  142. # 字段名设置合并条件
  143. colname = item['colname']
  144. if not item['alias']:
  145. item['alias'] = tablename + '_' + colname
  146. ############################# 结束修改 ############################
  147. # 创建字段节点
  148. result = graph.nodes.match("字段", alias = item['alias'])
  149. if len(result) > 0:
  150. # 查询到相关字段,使用旧字段
  151. end_node = result.first()
  152. relationship = Relationship(start_node, "foreignkey", end_node, **{"name": item['colname']})
  153. else:
  154. # 未查询到相关字段,创建节点
  155. end_node = Node("字段", name=colname, **item)
  156. graph.create(end_node)
  157. relationship = Relationship(start_node, "has", end_node)
  158. # 创建表字段关系
  159. graph.merge(relationship)
  160. def draw(self):
  161. if self.draw:
  162. nx.draw(self.G, with_labels = True)
  163. plt.show()
  164. else:
  165. return "Draw is not enabled"
  166. if __name__ == '__main__':
  167. path = '''data/数据库设计文档.docx'''
  168. path = '''data/数据库设计(1).docx'''
  169. # path = '''data/数据库设计(2).docx'''
  170. # path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
  171. # path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
  172. # path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
  173. word = Word(path)
  174. word.predict()