sprivacy committed 1 year ago
Commit b2543f9c1c

+ 63 - 0
README.md

@@ -0,0 +1,63 @@
+# SQLKnowledgeGraph
+Build, from scratch, a knowledge graph centered on database design documents, and use it to provide automated question answering and analysis services.
+
+
+
+
+
+
+table_1:
+
+| Column name | Remark |
+|---|---|
+| id | ID |
+| name | user name |
+| create_time | creation time |
+
+table_2:
+
+| Column name | Remark |
+|---|---|
+| id | ID |
+| file_id | file ID |
+| create_by | creator |
+| create_time | creation time |
+
+table_3:
+
+| Column name | Remark |
+|---|---|
+| id | ID |
+| file | file name |
+| create_time | creation time |
+
+
+
+When the column types are completely identical:
+
+I. Problems with vectors built from column name + table name:
+
+1. Table 1, Table 2, and Table 3 share an id field, yet they should be judged unrelated.
+2. Table 1, Table 2, and Table 3 share a create_time field, yet they should be judged unrelated.
+3. Table 3's id field and Table 2's file_id field should be recognized as the same field, i.e. judged a foreign-key relationship.
+4. Table 1's id field and Table 2's create_by field should be recognized as the same field, i.e. judged a foreign-key relationship.
+
+II. Problems with vectors built from table name + remark:
+
+1. Table 1, Table 2, and Table 3 share an id field, yet they should be judged unrelated.
+2. Table 1, Table 2, and Table 3 share a create_time field, yet they should be judged unrelated.
+3. Table 1's id field and Table 2's create_by field should be recognized as the same field (a foreign-key relationship), but the vectorized comparison is skewed by Table 3's file field.
+
+III. Vectors built from <table name, Chinese table name, column name, remark> (see the sketch after the field-mapping table below):
+
+
+
+
+Field mapping
+
+| Table name | Chinese table name | Column name | Column type | Column detail | Unique key |
+| ---       | ---        | ---     | ---     | ---       | ---       |
+| tablename | tablecname | colname | coltype | detail    | vec       |
+| name      | c_name     | name    | type    | detail    | vec       |
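+
+A minimal sketch of strategy III (an illustration; the encoder choice and the Chinese table names here are assumptions, not project decisions): flatten each field's <table name, Chinese table name, column name, remark> tuple into one string, embed it, and compare fields by cosine similarity.
+
+```python
+from text2vec import SentenceModel  # assumed encoder; any sentence-embedding model would do
+import numpy as np
+
+model = SentenceModel()  # defaults to a Chinese sentence-embedding checkpoint
+
+def field_vec(tablename, tablecname, colname, remark):
+    # strategy III: embed the flattened 4-tuple, not the bare column name
+    return model.encode([f"{tablename} {tablecname} {colname} {remark}"])[0]
+
+a = field_vec("table_1", "用户表", "id", "ID")
+b = field_vec("table_2", "文件记录表", "create_by", "creator")
+cos = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+print(cos)  # a high score marks a foreign-key candidate; table context damps the id/create_time false hits
+```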
+
+

BIN
data/FJS-OCR 富士通识别平台 数据库设计说明书.docx


BIN
data/url-qqp17mI32jTyozQt.docx


BIN
data/url-ukWkMKhnRgCvxVZt.docx


BIN
data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx


BIN
data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx


BIN
data/数据库设计(1).docx


BIN
data/数据库设计(2).docx


BIN
data/数据库设计文档.docx


BIN
data/数据库设计说明书.docx


BIN
data/电商-数据库详细设计说明书V0.4.docx


BIN
data/租房查询系统_数据库设计说明书_2.0.docx


+ 5 - 0
dict/columnname.txt

@@ -0,0 +1,5 @@
+名称
+字段名
+Field Name
+字段代码
+代码

+ 14 - 0
dict/columntype.txt

@@ -0,0 +1,14 @@
+int
+bigint
+float
+double
+decimal
+date
+datetime
+char
+varchar
+text
+longtext
+blob
+bool
+boolean

+ 5 - 0
dict/comment.txt

@@ -0,0 +1,5 @@
+Alias
+关联字段
+Description
+说明
+描述

+ 24 - 0
excel2sql.py

@@ -0,0 +1,24 @@
+import pandas as pd
+
+from py2neo import Node, Graph, Relationship
+
+graph = Graph('http://192.168.1.202:7474/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+io = '''数据表结构.xlsx'''
+
+df = pd.read_excel(io, sheet_name='Sheet1', header=[0])
+
+df.字段描述.fillna(value='', inplace=True)   # empty string instead of NaN in descriptions
+df['字段'] = df.字段.str.upper()             # normalize field names to upper case
+
+for row in df.itertuples():
+    try:
+        # one node per table and one per column, linked by a "has" relationship
+        start_node = Node("表", name=row.表, c_name=row.表名)
+        end_node   = Node("列", name=row.字段, type=row.字段类型, detail=row.字段描述)
+        relation   = Relationship(start_node, 'has', end_node)
+        graph.merge(start_node, "表", "name")
+        graph.merge(end_node, "列", "name")
+        graph.merge(relation)  # endpoints are bound after the node merges above
+    except Exception as exc:
+        print(row, exc)
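+
+# Illustrative follow-up (assumes the merges above succeeded): list a few
+# table -> column pairs straight from Neo4j via py2neo's Graph.run().
+for record in graph.run(
+        "MATCH (t:`表`)-[:has]->(c:`列`) "
+        "RETURN t.name AS table_name, c.name AS column_name LIMIT 10").data():
+    print(record)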

+ 168 - 0
jiebasim.py

@@ -0,0 +1,168 @@
+# import jieba
+# import numpy as np
+# import re
+ 
+# def get_word_vector(s1, s2):
+#     """
+#     :param s1: sentence 1
+#     :param s2: sentence 2
+#     :return: the term-frequency vectors of the two sentences
+#     """
+#     # tokenize with jieba
+#     cut1 = jieba.cut(s1)
+#     cut2 = jieba.cut(s2)
+#     list_word1 = (','.join(cut1)).split(',')
+#     list_word2 = (','.join(cut2)).split(',')
+
+#     # take the union of the words from both sentences
+#     key_word = list(set(list_word1 + list_word2))
+#     # zero-filled arrays to hold the two term-frequency vectors
+#     word_vector1 = np.zeros(len(key_word))
+#     word_vector2 = np.zeros(len(key_word))
+
+#     # count term frequencies,
+#     # filling each vector position in turn
+#     for i in range(len(key_word)):
+#         # count how often each key word appears in each sentence
+#         for j in range(len(list_word1)):
+#             if key_word[i] == list_word1[j]:
+#                 word_vector1[i] += 1
+#         for k in range(len(list_word2)):
+#             if key_word[i] == list_word2[k]:
+#                 word_vector2[i] += 1
+
+#     # print the vectors
+#     print(word_vector1)
+#     print(word_vector2)
+#     return word_vector1, word_vector2
+
+
+# def cos_dist(vec1,vec2):
+#     """
+#     :param vec1: vector 1
+#     :param vec2: vector 2
+#     :return: the cosine similarity of the two vectors
+#     """
+#     dist1 = float(np.dot(vec1,vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2)))
+#     return dist1
+
+
+# if __name__ == '__main__':
+#     s1 = "允许空值"
+#     s2 = "是否为空"
+#     vec1, vec2 = get_word_vector(s1, s2)
+#     dist1 = cos_dist(vec1, vec2)
+#     print(dist1)
+
+
+from transformers import AutoTokenizer, TFAutoModel
+import tensorflow as tf
+import matplotlib.pyplot as plt
+
+# load the tokenizer and the model
+model_name = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = TFAutoModel.from_pretrained(model_name,
+                                    output_hidden_states=True)  # Whether the model returns all hidden-states.
+
+# input test sentences
+utt = ['今天的月亮又大又圆', '月亮真的好漂亮啊', '今天去看电影吧', "爱情睡醒了,天琪抱着小贝进酒店", "侠客行风万里"]
+inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=64)
+outputs = model(inputs)
+hidden_states = outputs[2]  # all hidden-layer outputs
+"""
+About the output (hidden_states):
+1. The layer number (13 layers)
+2. The batch number (5 sentences), i.e. the number of input sentences
+3. The word / token number (64 tokens per sentence), i.e. max_length
+4. The hidden unit / feature number (768 features)
+
+Why 13 layers when BERT has 12? The first entry is the input embedding layer;
+the remaining 12 are BERT's encoder layers.
+"""
+print("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
+
+layer_i = 0
+print("Number of batches:", len(hidden_states[layer_i]))
+
+batch_i = 0
+print("Number of tokens:", len(hidden_states[layer_i][batch_i]))
+
+token_i = 0
+print("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
+
+# For the 5th token in our sentence, select its feature values from layer 5.
+token_i = 5
+layer_i = 5
+vec = hidden_states[layer_i][batch_i][token_i]
+
+# Plot the values as a histogram to show their distribution.
+plt.figure(figsize=(10, 10))
+plt.hist(vec, bins=200)
+plt.show()
+
+
+# Concatenate the tensors for all layers. We use `stack` here to
+# create a new dimension in the tensor.
+sentence_embeddings = tf.stack(hidden_states, axis=0)  # insert a new axis 0, so the 13 layers come first
+print(f"sentence_embeddings.shape : {sentence_embeddings.shape}")
+
+# permute to [sentence, token, layer, feature], so each token carries its 13 layer embeddings
+sentence_embeddings_perm = tf.transpose(sentence_embeddings, perm=[1, 2, 0, 3])
+print(f"sentence_embeddings_perm.shape : {sentence_embeddings_perm.shape}")
+
+# Token-level dense vectors
+## Option 1: concatenate the last four layers
+for sentence_embedding in sentence_embeddings_perm:  # one embedding block per sentence
+    print(f"sentence_embedding.shape: {sentence_embedding.shape}")
+    token_vecs_cat = []
+    for token_embedding in sentence_embedding:  # one embedding block per token
+        print(f"token_embedding.shape : {token_embedding.shape}")
+        cat_vec = tf.concat([token_embedding[-1], token_embedding[-2], token_embedding[-3], token_embedding[-4]], axis=0)
+        print(f"cat_vec.shape : {cat_vec.shape}")
+        token_vecs_cat.append(cat_vec)
+    print(f"len(token_vecs_cat) : {len(token_vecs_cat)}")
+
+## Option 2: sum the last four layers
+for sentence_embedding in sentence_embeddings_perm:  # one embedding block per sentence
+    print(f"sentence_embedding.shape: {sentence_embedding.shape}")
+    token_vecs_cat = []
+    for token_embedding in sentence_embedding:  # one embedding block per token
+        print(f"token_embedding.shape : {token_embedding.shape}")
+        cat_vec = sum(token_embedding[-4:])
+        print(f"cat_vec.shape : {cat_vec.shape}")
+        token_vecs_cat.append(cat_vec)
+    print(f"len(token_vecs_cat) : {len(token_vecs_cat)}")
+
+
+# Sentence-level dense vectors
+## average every token's second-to-last layer
+token_vecs = sentence_embeddings[-2]
+print(f"token_vecs.shape : {token_vecs.shape}")
+sentences_embedding = tf.reduce_mean(token_vecs, axis=1)
+print(f"sentences_embedding.shape : {sentences_embedding.shape}")
+
+
+# Cosine similarity
+## similarity between different sentences
+tensor_test = sentences_embedding[0]
+consine_sim_tensor = tf.keras.losses.cosine_similarity(tensor_test, sentences_embedding)
+print(f"consine_sim_tensor : {consine_sim_tensor}")
+
+
+## how the vectors of the same word "bank" compare across different contexts
+utt = ["After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]
+inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=22)
+outputs = model(inputs)
+hidden_states = outputs[2]  # all hidden-layer outputs
+tokens_embedding = tf.reduce_sum(hidden_states[-4:], axis=0) # sum of the last four layers
+bank_vault = tokens_embedding[0][6]
+bank_robber = tokens_embedding[0][10]
+river_bank = tokens_embedding[0][19]
+consine_sim_tensor = tf.keras.losses.cosine_similarity(bank_vault, [bank_robber, river_bank])
+print(f"consine_sim_tensor : {consine_sim_tensor}")
+# consine_sim_tensor : [-0.93863535 -0.69570863]
+# bank in bank_vault and bank in bank_robber are more similar to each other than to the river-bank sense, which is reasonable!
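+
+
+## Follow-up sketch (an added illustration): score two column remarks with the
+## same mean-pooled second-to-last-layer trick. A Chinese checkpoint such as
+## "bert-base-chinese" would suit these strings better than bert-base-uncased;
+## the pooling logic is identical.
+utt = ["允许空值", "是否为空"]
+inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=16)
+outputs = model(inputs)
+remark_embeddings = tf.reduce_mean(outputs[2][-2], axis=1)  # [2, 768]
+consine_sim = tf.keras.losses.cosine_similarity(remark_embeddings[0], remark_embeddings[1])
+print(f"remark similarity : {consine_sim}")  # closer to -1 means more similar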
+
+

+ 394 - 0
test.py

@@ -0,0 +1,394 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+
+Column-type check: re.match(r'\w+\(\d+\)', s)
+
+1. Decide whether a table is standard [criterion] -> does it contain column types -> locate the columns
+2. Detect the column-name column [yes] -> check it follows the naming convention
+3. Detect the Chinese column name [rule] -> locate the Chinese column name
+4. Detect the column type [yes] -> check it follows the type convention
+5. Detect the primary key [not found] -> check whether the column is an ID
+6. Detect the required flag [criterion] -> classify [nullable | required] -> [no such column] -> default to required
+7. Detect the column comment [not found] -> Chinese name [not found] -> table name + column name -> [bool/time/blob-like types] -> append a UUID
+8. Field-uniqueness key: <column name, column type, comment>
+"""
+import re
+import uuid
+import logging
+from collections import Counter
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+import jieba
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+# from text2vec import Similarity
+# sim = Similarity()
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+# # column-type dictionary
+# with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_TYPES = {i.strip() for i in fp.readlines()}
+# # column-name dictionary
+# with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_DICT = {i.strip(): 'name' for i in fp.readlines()}
+# # comment dictionary
+# with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
+#     REMARK_DICT = {i.strip(): 'remark' for i in fp.readlines()}
+
+
+COLUMN_TYPES = {'int', 'bigint', 'tinyint', 'smallint', 'bigint unsigned', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
+
+TYPE_DICT = {'类型': 'column_type', '数据类型': 'column_type'}
+
+COLUMN_DICT = {'名称': 'name', '字段名': 'name', 'field name': 'name', '字段代码': 'name', '代码': 'name', '物理字段名': 'name'}
+
+C_NAME_DICT = {'字段中文名': 'c_name', '中文含义': 'c_name', '名字': 'c_name', '字段名称': 'c_name', '逻辑字段名': 'c_name'}
+
+REMARK_DICT = {'Alias': 'remark', 'description': 'remark', '说明': 'remark', '描述': 'remark', '备注': 'remark'}
+
+REQUIRED_DICT = {'空/非空': 'required', '可不可以为空': 'required', '是否为空': 'required', '允许空值': 'required', '是否必填': 'required', '空值': 'required'}
+
+PRIMARY_KEY_DICT = {'主键': 'primary_key'}
+
+FOREIGN_KEY_DICT = {'外键': 'foreign_key'}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+        self.namecount = dict({})
+        self.all_tables = pd.DataFrame()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text.lower())
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif hasattr(block, "text") and block.text:  # Table objects have no .text
+                yield ('Unknown', block)
+
+    # def clean_table(self, raw_table):
+    #     table = []
+    #     dirty_table = []
+    #     while raw_table and '' in raw_table[0]:
+    #         raw_table = raw_table[1:]
+
+    #     # 表格预处理
+    #     rowslen = [len(row) for row in raw_table]
+
+    #     if not rowslen:
+    #         return None
+
+    #     rowlen = Counter(rowslen).most_common(1)[0][0]
+    #     for i,l in enumerate(rowslen):
+    #         if l == rowlen:
+    #             table.append(raw_table[i])
+    #         else:
+    #             dirty_table.append(raw_table[i])
+    #     return table, dirty_table
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal'] and r[1]:
+                tablename = r[1]
+                logger.debug(tablename)
+            elif r[0] == 'Table':
+
+                # table = r[1]
+
+                # table, dirty_table = self.clean_table(r[1])
+                table, dirty_table = self.get_table(r[1])
+
+                if not table:
+                    continue
+
+                # decide whether this table needs to be parsed
+                if any({'fulltype', 'type'} & {self.detect_type(i) for i in table[1]}):
+
+                    ############################### parse the database table name ##############################
+                    if re.search("[a-zA-Z_]+", tablename):
+                        table_name = re.search("[a-zA-Z_]+", tablename).group()
+                        try:
+                            table_c_name = re.search('[\u4e00-\u9fa5]{3,}', tablename).group()
+                        except Exception:
+                            table_c_name = "未知表"
+                        logger.info(f"Found database table, name: {table_name}\tguessed Chinese name: {table_c_name}")
+                    else:
+                        table_name = "UnknownTable"
+                        table_c_name = "未知表"
+                    ############################### end of table-name parsing ###############################
+
+
+                    ############################# field adjustments start here ############################
+                    df = pd.DataFrame(table)
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+
+                    # normalize header names to the canonical keys
+                    df.rename(columns={**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **PRIMARY_KEY_DICT, **FOREIGN_KEY_DICT}, inplace=True)
+
+                    df['table_name'] = table_name
+                    df['table_c_name'] = table_c_name
+
+                    # locate the column-type column by probing the first data row
+                    for i in df.columns:
+                        if self.detect_type(df.loc[1, i]) != 'unknown':
+                            df.rename(columns={i: 'column_type'}, inplace=True)
+                            break
+
+                    # nullable check (commented out): fields allowed to be null must not be linked
+                    # for i in df.columns:
+                    #     if sim.get_score(i, '允许值为空') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: str(uuid.uuid1()) if x != 'n' else '')
+                    #         break
+                    #     elif sim.get_score(i, '必填') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: '' if x != 'n' else str(uuid.uuid1()))
+                    #         break
+                    # if 'unique' not in df.columns:
+                    #     print("无法判断字段是否必填")
+                    #     df['unique'] = ''
+
+                    ############################### required-field check ###############################
+                    if 'required' not in df.columns:
+                        logger.warning("Cannot tell whether fields are required; defaulting to required, so every field may be linked!")
+                        df['required'] = ''
+                    ############################### required-field check ###############################
+
+                    # a remark column is mandatory
+                    if 'remark' not in df.columns:
+                        if 'c_name' not in df.columns:
+                            logger.warning(f"No remark column found; current columns: {df.columns}")
+                            df['remark'] = ''
+                        else:
+                            logger.warning(f"No remark column found, falling back to the Chinese column name; current columns: {df.columns}")
+                            df['remark'] = df['c_name']
+
+                    ############################### make designated fields unlinkable ###############################
+                    df['remark'] = df['remark'] + df['column_type'].apply(lambda x: str(uuid.uuid1()) if x in ['date', 'datetime', 'blob', 'text'] else '')
+                    df['remark'] = df['remark'] + df['name'].apply(lambda x: str(uuid.uuid1()) if x in ['create_time', 'update_time'] else '')
+                    df['remark'] = df.apply(lambda x: x['name'] if not x['remark'] else x['remark'], axis=1)
+                    ############################### make designated fields unlinkable ###############################
+
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+                    pass
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+
+
+                    if not df.query(' name in ["id", "ID", "Id"] ').empty:
+                        idx = df.query(' name in ["id", "ID", "Id"] ').index[0]
+                        df.loc[idx, 'remark'] = ''.join([table_name, '_id'])
+
+                    # the field-uniqueness key
+                    logger.debug(f'''remark type: {type(df.loc[1, 'remark'])}''')
+                    logger.debug(f'''column_type type: {type(df.loc[1, 'column_type'])}''')
+
+                    df['vec'] = df['name'] + '_' + df['column_type'] + '_' + df['remark']
+                    # ############################### field adjustments end here ##############################
+
+
+                    # if self.draw:
+                    #     self.G.add_node(table_name)
+
+                    #     nodelist = [
+                    #         (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                    #     ]
+                    #     self.G.add_nodes_from(nodelist)
+
+                    #     edgelist = [
+                    #         (table_name, node[0]) for node in nodelist
+                    #     ]
+                    #     self.G.add_edges_from(edgelist)
+
+                    # create the table node, reusing it if it already exists
+                    result = graph.nodes.match("表", name = table_name, c_name=table_c_name)
+                    if len(result) > 0:
+                        start_node = result.first()
+                    else:
+                        start_node = Node("表", name=table_name, c_name=table_c_name)
+                        graph.create(start_node)
+
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        ############################# edit here ############################
+                        # the merge key for the field
+                        name = item['name']
+
+                        if not item['vec']:
+                            item['vec'] = table_name + '_' + name
+                        ############################# end edit ############################
+
+                        # create or reuse the field node
+                        result = graph.nodes.match("字段", vec=item['vec'])
+
+                        if len(result) > 0:
+                            # a matching field already exists: reuse it
+                            end_node = result.first()
+                            relationship = Relationship(start_node, "related", end_node, **{"name": item['name']})
+                        else:
+                            # no match: create a new field node
+                            end_node = Node("字段", **item)
+                            graph.create(end_node)
+                            relationship = Relationship(start_node, "has", end_node)
+
+                        # create the table-field relationship
+                        graph.merge(relationship)
+                else:
+                    print("non-standard table:", table)
+            else:
+                print(r)
+        print(self.all_tables.columns)
+        print(self.all_tables)
+
+    def detect_type(self, text: str):
+        # 'fulltype' matches e.g. varchar(64); 'type' matches a bare type such as int
+        fulltype = re.match(r'(\w+)\(\d+\)', text)
+        if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+            return 'fulltype'
+        elif text in COLUMN_TYPES:
+            return 'type'
+        else:
+            return 'unknown'
+    
+    def get_table(self, raw_table):
+        table = []
+        dirty_table = []
+        has_head = False
+        for row in raw_table:
+            if has_head:
+                table.append(row)
+            elif set(row) & set({**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **FOREIGN_KEY_DICT}.keys()):
+                head = row
+                has_head = True
+            else:
+                dirty_table.append(row)
+
+        # for row in raw_table:
+        #     if get_head:
+        #         table.append(row)
+        #         continue
+
+        #     for col in row:
+        #         fulltype = re.match(r'(\w+)\(\d+\)', col)
+        #         if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #         elif col in COLUMN_TYPES:
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #     else:
+        #         head = row
+
+        if table and (len(head) == len(table[0])) and (len(Counter([len(_) for _ in table]).keys()) == 1):
+            table.insert(0, head)
+            return table, dirty_table
+        else:
+            return None, dirty_table
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    # path = '''data/数据库设计说明书.docx'''
+    path = '''data/数据库设计文档.docx'''
+    path = '''data/数据库设计(1).docx'''
+    path = '''data/数据库设计(2).docx'''
+    path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+    path = '''data/租房查询系统_数据库设计说明书_2.0.docx'''
+    path = '''data/url-ukWkMKhnRgCvxVZt.docx'''
+    path = '''data/url-qqp17mI32jTyozQt.docx'''
+    path = '''data/电商-数据库详细设计说明书V0.4.docx'''
+
+    word = Word(path)
+    word.predict()
+
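+    # Illustrative follow-up (an assumed usage pattern, not part of predict()):
+    # fields reached by more than one table via has/related edges are the
+    # foreign-key candidates the README describes.
+    for record in graph.run(
+            "MATCH (a:`表`)-[:has|related]->(f:`字段`)<-[:has|related]-(b:`表`) "
+            "WHERE a.name < b.name "
+            "RETURN a.name AS table_a, f.name AS field, b.name AS table_b").data():
+        print(record)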

+ 89 - 0
wordsim.py

@@ -0,0 +1,89 @@
+# coding=utf-8
+
+import gzip
+from gensim.models import Word2Vec
+from gensim.test.utils import common_texts
+
+# sentences: the corpus to analyse
+# size: dimensionality of the word vectors, default 100 (vector_size in gensim 4.x)
+# window: context window on each side, default 5
+# sg: model choice; 0 = CBOW (default), 1 = Skip-Gram
+# hs: 0 = negative sampling (default); 1 = hierarchical softmax (when negative > 0)
+# negative: number of negative samples, default 5; [3, 10] is recommended
+# cbow_mean: 0 = sum the context vectors, 1 = average them (default; not worth changing)
+# min_count: minimum word frequency to keep; workers: number of worker threads
+# iter: maximum number of SGD epochs, default 5 (epochs in gensim 4.x); raise it for large corpora
+# alpha: initial SGD learning rate, default 0.025
+# min_alpha: floor of the decaying learning rate; tune alpha, min_alpha and iter together on large corpora
+
+model_simple = Word2Vec(sentences=common_texts, window=1,
+                                      min_count=1, workers=4)
+# returns (effective word count, total raw words processed)
+print(model_simple.train([["hello", "world", "michael"]], total_examples=1, epochs=2))
+
+sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+
+model_simple = Word2Vec(min_count=1)
+model_simple.build_vocab(sentences)  # build the vocabulary
+print(model_simple.train(sentences, total_examples=model_simple.corpus_count
+                         , epochs=model_simple.epochs))
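+
+# Quick sanity check on the toy corpus (an added illustration; gensim 4.x API).
+# With min_count=1 every token is in the vocabulary, though vectors trained on
+# two tiny sentences are essentially random.
+print(model_simple.wv.most_similar("cat", topn=2))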
+
+
+"""
+# load the OpinRank corpus: car and hotel reviews
+data_file="../nlp-in-practice-master/word2vec/reviews_data.txt.gz"
+
+
+# read the OpinRank corpus and preprocess it (note: needs `import gensim` for gensim.utils.simple_preprocess)
+def read_input(input_file):
+    with gzip.open (input_file, 'rb') as f:
+        for i, line in enumerate (f):
+            # preprocessing
+            yield gensim.utils.simple_preprocess(line)
+
+# load the OpinRank corpus and tokenize it
+documents = list(read_input(data_file))
+# print(documents)
+
+
+print(len(documents))
+
+# train the Word2Vec model (takes roughly 10 minutes)
+model = Word2Vec(documents,
+            vector_size=150, window=10,
+            min_count=2, workers=10)
+print(model.train(documents, total_examples=len(documents), epochs=10))
+
+
+# words similar to 'dirty'
+w1 = "dirty"
+print(model.wv.most_similar(positive=w1))
+# positive: similar words
+
+
+# words similar to 'polite'
+w1 = ["polite"]
+print(model.wv.most_similar(positive=w1, topn=6))
+# topn: list only the top n
+
+
+# words similar to 'france'
+w1 = ["france"]
+print(model.wv.most_similar(positive=w1, topn=6))
+# topn: list only the top n
+
+
+# similar to 'bed', 'sheet', 'pillow'; dissimilar to 'couch'
+w1 = ["bed",'sheet','pillow']
+w2 = ['couch']
+print(model.wv.most_similar(positive=w1, negative=w2, topn=10))
+# negative: dissimilar words
+
+# compare the similarity of two words
+print(model.wv.similarity(w1="dirty", w2="smelly"))
+print(model.wv.similarity(w1="dirty", w2="dirty"))
+print(model.wv.similarity(w1="dirty", w2="clean"))
+
+# pick the word that doesn't belong
+print(model.wv.doesnt_match(["cat", "dog", "france"]))
+"""

+ 24 - 0
wordtable-24-05-10.py

@@ -0,0 +1,24 @@
+import pandas as pd
+
+from py2neo import Node, Graph, Relationship
+
+graph = Graph('http://192.168.1.202:7474/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+io = '''数据表结构.xlsx'''
+
+df = pd.read_excel(io, sheet_name='Sheet1', header=[0])
+
+df.字段描述.fillna(value='', inplace=True)   # empty string instead of NaN in descriptions
+df['字段'] = df.字段.str.upper()             # normalize field names to upper case
+
+for row in df.itertuples():
+    try:
+        # one node per table and one per column, linked by a "has" relationship
+        start_node = Node("表", name=row.表, c_name=row.表名)
+        end_node   = Node("列", name=row.字段, type=row.字段类型, detail=row.字段描述)
+        relation   = Relationship(start_node, 'has', end_node)
+        graph.merge(start_node, "表", "name")
+        graph.merge(end_node, "列", "name")
+        graph.merge(relation)  # endpoints are bound after the node merges above
+    except Exception as exc:
+        print(row, exc)

+ 185 - 0
wordtable-24-05-12.py

@@ -0,0 +1,185 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+"""
+import re
+import logging
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+
+import uuid
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+coltypes = {'int', 'bigint', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
+
+coldict = {'名称': 'colname', '字段名': 'colname', 'Field Name': 'colname', '关联字段': 'alias', 'Alias': 'alias'}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text)
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal']:
+                tablename = r[1]
+                logger.debug(tablename)
+            if r[0] == 'Table':
+                # decide whether this table needs to be parsed
+                if any(coltypes & {i.lower() for i in r[1][1]}):
+                    # parse the database table name
+                    if re.search("[a-zA-Z_]+", tablename):
+                        tablename = re.search("[a-zA-Z_]+", tablename).group()
+                    logger.info(f"Found database table, name: {tablename}")
+
+                    df = pd.DataFrame(r[1])
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+
+                    if self.enable_draw:
+                        self.G.add_node(tablename)
+
+                        nodelist = [
+                            (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                        ]
+                        self.G.add_nodes_from(nodelist)
+
+                        edgelist = [
+                            (tablename, node[0]) for node in nodelist
+                        ]
+                        self.G.add_edges_from(edgelist)
+
+                    # create the table node
+                    start_node = Node("表", name=tablename)
+                    graph.merge(start_node, "表", "name")
+
+                    # normalize header names to the canonical keys
+                    df.rename(columns=coldict, inplace=True)
+                    # an alias column is mandatory
+                    if 'alias' not in df.columns:
+                        logger.warning(f"No Alias column found; current columns: {df.columns}")
+                        df['alias'] = ''
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        # the merge key for the field
+                        colname = item['colname']
+                        if not item['alias']:
+                            item['alias'] = tablename + '_' + colname
+
+                        # create the field node, merged on its alias
+                        end_node = Node("字段", name=colname, **item)
+                        graph.merge(end_node, "字段", "alias")
+
+                        # create the table-field relationship; Relationship.type("has")
+                        # returns a class whose instances merge on (start, type, end)
+                        HAS = Relationship.type("has")
+                        graph.merge(HAS(start_node, end_node))
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    path = '''数据库设计文档.docx'''
+    path = '''数据库设计(1).docx'''
+    path = '''数据库设计(2).docx'''
+    # path = '''国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    # path = '''FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    # path = '''中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+
+    word = Word(path)
+    word.predict()

+ 211 - 0
wordtable-24-05-14.py

@@ -0,0 +1,211 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+
+Column-type check: re.match(r'\w+\(\d+\)', s)
+"""
+import re
+import logging
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+
+import uuid
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+# column-type dictionary
+with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
+    coltypes = {i.strip() for i in fp.readlines()}
+# column-name dictionary
+with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
+    coldict = {i.strip(): 'colname' for i in fp.readlines()}
+# comment dictionary
+with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
+    aliasdict = {i.strip(): 'alias' for i in fp.readlines()}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text)
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal']:
+                tablename = r[1]
+                logger.debug(tablename)
+            elif r[0] == 'Table':
+                # decide whether this table needs to be parsed
+                if any(coltypes & {i.lower() for i in r[1][1]}):
+                    # parse the database table name
+                    if re.search("[a-zA-Z_]+", tablename):
+                        tablename = re.search("[a-zA-Z_]+", tablename).group()
+                        logger.info(f"Found database table, name: {tablename}")
+
+                    ############################# edit here ############################
+                    df = pd.DataFrame(r[1])
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+                    ############################# end edit ############################
+
+                    if self.enable_draw:
+                        self.G.add_node(tablename)
+
+                        nodelist = [
+                            (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                        ]
+                        self.G.add_nodes_from(nodelist)
+
+                        edgelist = [
+                            (tablename, node[0]) for node in nodelist
+                        ]
+                        self.G.add_edges_from(edgelist)
+
+                    # create the table node
+                    start_node = Node("表", name=tablename)
+                    graph.create(start_node)
+
+                    ############################### edit here ##############################
+                    # normalize header names to the canonical keys
+                    df.rename(columns=coldict, inplace=True)
+                    df.rename(columns=aliasdict, inplace=True)
+                    df['tablename'] = tablename
+                    df['fullname'] = df['tablename'] + '_' + df['colname']
+                    ############################### end edit ##############################
+
+                    # an alias (comment) column is mandatory
+                    if 'alias' not in df.columns:
+                        logger.warning(f"No comment column found; current columns: {df.columns}")
+                        df['alias'] = ''
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        ############################# edit here ############################
+                        # the merge key for the field
+                        colname = item['colname']
+
+                        if not item['alias']:
+                            item['alias'] = tablename + '_' + colname
+                        ############################# end edit ############################
+
+                        # create or reuse the field node
+                        result = graph.nodes.match("字段", alias=item['alias'])
+
+                        if len(result) > 0:
+                            # a matching field already exists: reuse it as a foreign-key link
+                            end_node = result.first()
+                            relationship = Relationship(start_node, "foreignkey", end_node, **{"name": item['colname']})
+                        else:
+                            # no match: create a new field node
+                            end_node = Node("字段", name=colname, **item)
+                            graph.create(end_node)
+                            relationship = Relationship(start_node, "has", end_node)
+
+                        # create the table-field relationship
+                        graph.merge(relationship)
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    path = '''data/数据库设计文档.docx'''
+    path = '''data/数据库设计(1).docx'''
+    # path = '''data/数据库设计(2).docx'''
+    # path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    # path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    # path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+
+    word = Word(path)
+    word.predict()

+ 394 - 0
wordtable-24-05-17.py

@@ -0,0 +1,394 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+
+Column-type check: re.match(r'\w+\(\d+\)', s)
+
+1. Decide whether a table is standard [criterion] -> does it contain column types -> locate the columns
+2. Detect the column-name column [yes] -> check it follows the naming convention
+3. Detect the Chinese column name [rule] -> locate the Chinese column name
+4. Detect the column type [yes] -> check it follows the type convention
+5. Detect the primary key [not found] -> check whether the column is an ID
+6. Detect the required flag [criterion] -> classify [nullable | required] -> [no such column] -> default to required
+7. Detect the column comment [not found] -> Chinese name [not found] -> table name + column name -> [bool/time/blob-like types] -> append a UUID
+8. Field-uniqueness key: <column name, column type, comment>
+"""
+import re
+import uuid
+import logging
+from collections import Counter
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+import jieba
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+# from text2vec import Similarity
+# sim = Similarity()
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+# # column-type dictionary
+# with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_TYPES = {i.strip() for i in fp.readlines()}
+# # column-name dictionary
+# with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_DICT = {i.strip(): 'name' for i in fp.readlines()}
+# # comment dictionary
+# with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
+#     REMARK_DICT = {i.strip(): 'remark' for i in fp.readlines()}
+
+
+COLUMN_TYPES = {'int', 'bigint', 'tinyint', 'smallint', 'bigint unsigned', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
+
+TYPE_DICT = {'类型': 'column_type', '数据类型': 'column_type'}
+
+COLUMN_DICT = {'名称': 'name', '字段名': 'name', 'field name': 'name', '字段代码': 'name', '代码': 'name', '物理字段名': 'name'}
+
+C_NAME_DICT = {'字段中文名': 'c_name', '中文含义': 'c_name', '名字': 'c_name', '字段名称': 'c_name', '逻辑字段名': 'c_name'}
+
+REMARK_DICT = {'Alias': 'remark', 'description': 'remark', '说明': 'remark', '描述': 'remark', '备注': 'remark'}
+
+REQUIRED_DICT = {'空/非空': 'required', '可不可以为空': 'required', '是否为空': 'required', '允许空值': 'required', '是否必填': 'required', '空值': 'required'}
+
+PRIMARY_KEY_DICT = {'主键': 'primary_key'}
+
+FOREIGN_KEY_DICT = {'外键': 'foreign_key'}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+        self.namecount = dict({})
+        self.all_tables = pd.DataFrame()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text.lower())
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif hasattr(block, "text") and block.text:  # Table objects have no .text
+                yield ('Unknown', block)
+
+    # def clean_table(self, raw_table):
+    #     table = []
+    #     dirty_table = []
+    #     while raw_table and '' in raw_table[0]:
+    #         raw_table = raw_table[1:]
+
+    #     # 表格预处理
+    #     rowslen = [len(row) for row in raw_table]
+
+    #     if not rowslen:
+    #         return None
+
+    #     rowlen = Counter(rowslen).most_common(1)[0][0]
+    #     for i,l in enumerate(rowslen):
+    #         if l == rowlen:
+    #             table.append(raw_table[i])
+    #         else:
+    #             dirty_table.append(raw_table[i])
+    #     return table, dirty_table
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal'] and r[1]:
+                tablename = r[1]
+                logger.debug(tablename)
+            elif r[0] == 'Table':
+
+                # table = r[1]
+
+                # table, dirty_table = self.clean_table(r[1])
+                table, dirty_table = self.get_table(r[1])
+
+                if not table:
+                    continue
+
+                # decide whether this table needs to be parsed
+                if any({'fulltype', 'type'} & {self.detect_type(i) for i in table[1]}):
+
+                    ############################### parse the database table name ##############################
+                    if re.search("[a-zA-Z_]+", tablename):
+                        table_name = re.search("[a-zA-Z_]+", tablename).group()
+                        try:
+                            table_c_name = re.search('[\u4e00-\u9fa5]{3,}', tablename).group()
+                        except Exception:
+                            table_c_name = "未知表"
+                        logger.info(f"Found database table, name: {table_name}\tguessed Chinese name: {table_c_name}")
+                    else:
+                        table_name = "UnknownTable"
+                        table_c_name = "未知表"
+                    ############################### end of table-name parsing ###############################
+
+
+                    ############################# field adjustments start here ############################
+                    df = pd.DataFrame(table)
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+
+                    # normalize header names to the canonical keys
+                    df.rename(columns={**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **PRIMARY_KEY_DICT, **FOREIGN_KEY_DICT}, inplace=True)
+
+                    df['table_name'] = table_name
+                    df['table_c_name'] = table_c_name
+
+                    # locate the column-type column by probing the first data row
+                    for i in df.columns:
+                        if self.detect_type(df.loc[1, i]) != 'unknown':
+                            df.rename(columns={i: 'column_type'}, inplace=True)
+                            break
+
+                    # nullable check (commented out): fields allowed to be null must not be linked
+                    # for i in df.columns:
+                    #     if sim.get_score(i, '允许值为空') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: str(uuid.uuid1()) if x != 'n' else '')
+                    #         break
+                    #     elif sim.get_score(i, '必填') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: '' if x != 'n' else str(uuid.uuid1()))
+                    #         break
+                    # if 'unique' not in df.columns:
+                    #     print("无法判断字段是否必填")
+                    #     df['unique'] = ''
+
+                    ############################### required-field check ###############################
+                    if 'required' not in df.columns:
+                        logger.warning("Cannot tell whether fields are required; defaulting to required, so every field may be linked!")
+                        df['required'] = ''
+                    ############################### required-field check ###############################
+
+                    # a remark column is mandatory
+                    if 'remark' not in df.columns:
+                        if 'c_name' not in df.columns:
+                            logger.warning(f"No remark column found; current columns: {df.columns}")
+                            df['remark'] = ''
+                        else:
+                            logger.warning(f"No remark column found, falling back to the Chinese column name; current columns: {df.columns}")
+                            df['remark'] = df['c_name']
+
+                    ############################### make designated fields unlinkable ###############################
+                    df['remark'] = df['remark'] + df['column_type'].apply(lambda x: str(uuid.uuid1()) if x in ['date', 'datetime', 'blob', 'text'] else '')
+                    df['remark'] = df['remark'] + df['name'].apply(lambda x: str(uuid.uuid1()) if x in ['create_time', 'update_time'] else '')
+                    df['remark'] = df.apply(lambda x: x['name'] if not x['remark'] else x['remark'], axis=1)
+                    ############################### make designated fields unlinkable ###############################
+
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+                    pass
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+
+
+                    if not df.query(' name in ["id", "ID", "Id"] ').empty:
+                        idx = df.query(' name in ["id", "ID", "Id"] ').index[0]
+                        df.loc[idx, 'remark'] = ''.join([table_name, '_id'])
+
+                    # the field-uniqueness key
+                    logger.debug(f'''remark type: {type(df.loc[1, 'remark'])}''')
+                    logger.debug(f'''column_type type: {type(df.loc[1, 'column_type'])}''')
+
+                    df['vec'] = df['name'] + '_' + df['column_type'] + '_' + df['remark']
+                    # ############################### field adjustments end here ##############################
+
+
+                    # if self.draw:
+                    #     self.G.add_node(table_name)
+
+                    #     nodelist = [
+                    #         (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                    #     ]
+                    #     self.G.add_nodes_from(nodelist)
+
+                    #     edgelist = [
+                    #         (table_name, node[0]) for node in nodelist
+                    #     ]
+                    #     self.G.add_edges_from(edgelist)
+
+                    # create the table node, reusing it if it already exists
+                    result = graph.nodes.match("表", name = table_name, c_name=table_c_name)
+                    if len(result) > 0:
+                        start_node = result.first()
+                    else:
+                        start_node = Node("表", name=table_name, c_name=table_c_name)
+                        graph.create(start_node)
+
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        ############################# edit here ############################
+                        # the merge key for the field
+                        name = item['name']
+
+                        if not item['vec']:
+                            item['vec'] = table_name + '_' + name
+                        ############################# end edit ############################
+
+                        # create or reuse the field node
+                        result = graph.nodes.match("字段", vec=item['vec'])
+
+                        if len(result) > 0:
+                            # a matching field already exists: reuse it
+                            end_node = result.first()
+                            relationship = Relationship(start_node, "related", end_node, **{"name": item['name']})
+                        else:
+                            # no match: create a new field node
+                            end_node = Node("字段", **item)
+                            graph.create(end_node)
+                            relationship = Relationship(start_node, "has", end_node)
+
+                        # create the table-field relationship
+                        graph.merge(relationship)
+                else:
+                    print("non-standard table:", table)
+            else:
+                print(r)
+        print(self.all_tables.columns)
+        print(self.all_tables)
+
+    def detect_type(self, text: str):
+        # 'fulltype' matches e.g. varchar(64); 'type' matches a bare type such as int
+        fulltype = re.match(r'(\w+)\(\d+\)', text)
+        if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+            return 'fulltype'
+        elif text in COLUMN_TYPES:
+            return 'type'
+        else:
+            return 'unknown'
+    
+    def get_table(self, raw_table):
+        table = []
+        dirty_table = []
+        has_head = False
+        for row in raw_table:
+            if has_head:
+                table.append(row)
+            elif set(row) & set({**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **FOREIGN_KEY_DICT}.keys()):
+                head = row
+                has_head = True
+            else:
+                dirty_table.append(row)
+
+        # for row in raw_table:
+        #     if get_head:
+        #         table.append(row)
+        #         continue
+
+        #     for col in row:
+        #         fulltype = re.match(r'(\w+)\(\d+\)', col)
+        #         if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #         elif col in COLUMN_TYPES:
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #     else:
+        #         head = row
+
+        if table and (len(head) == len(table[0])) and (len(Counter([len(_) for _ in table]).keys()) == 1):
+            table.insert(0, head)
+            return table, dirty_table
+        else:
+            return None, dirty_table
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    # path = '''data/数据库设计说明书.docx'''
+    path = '''data/数据库设计文档.docx'''
+    path = '''data/数据库设计(1).docx'''
+    path = '''data/数据库设计(2).docx'''
+    path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+    path = '''data/租房查询系统_数据库设计说明书_2.0.docx'''
+    path = '''data/url-ukWkMKhnRgCvxVZt.docx'''
+    path = '''data/url-qqp17mI32jTyozQt.docx'''
+    path = '''data/电商-数据库详细设计说明书V0.4.docx'''
+
+    word = Word(path)
+    word.predict()
+