sprivacy committed 1 year ago
Commit b2543f9c1c

+ 63 - 0
README.md

@@ -0,0 +1,63 @@
+# SQLKnowledgeGraph
+Build, from scratch, a knowledge graph centered on database design documents, and use it to provide automated question answering and analysis services.
+
+
+
+
+
+
+table_1:
+
+| Column name | Remark |
+|---|---|
+| id | ID |
+| name | user name |
+| create_time | creation time |
+
+table_2:
+
+| Column name | Remark |
+|---|---|
+| id | ID |
+| file_id | file ID |
+| create_by | creator |
+| create_time | creation time |
+
+table_3:
+
+| Column name | Remark |
+|---|---|
+| id | ID |
+| file | file name |
+| create_time | creation time |
+
+
+
+When the column types are completely identical:
+
+I. Problems with vectors built from column name + table name:
+
+1. Table 1, Table 2, and Table 3 share an id field, yet they should be judged unrelated.
+2. Table 1, Table 2, and Table 3 share a create_time field, yet they should be judged unrelated.
+3. Table 3's id field and Table 2's file_id field should be recognized as the same field, i.e. judged a foreign-key relationship.
+4. Table 1's id field and Table 2's create_by field should be recognized as the same field, i.e. judged a foreign-key relationship.
+
+II. Problems with vectors built from table name + remark:
+
+1. Table 1, Table 2, and Table 3 share an id field, yet they should be judged unrelated.
+2. Table 1, Table 2, and Table 3 share a create_time field, yet they should be judged unrelated.
+3. Table 1's id field and Table 2's create_by field should be recognized as the same field (a foreign-key relationship), but the vectorized comparison is skewed by Table 3's file field.
+
+III. Vectors built from <table name, Chinese table name, column name, remark> (see the sketch after the field-mapping table below):
+
+
+
+
+Field mapping
+
+| Table name | Chinese table name | Column name | Column type | Column detail | Unique key |
+| ---       | ---        | ---     | ---     | ---       | ---       |
+| tablename | tablecname | colname | coltype | detail    | vec       |
+| name      | c_name     | name    | type    | detail    | vec       |
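+
+A minimal sketch of strategy III (an illustration; the encoder choice and the Chinese table names here are assumptions, not project decisions): flatten each field's <table name, Chinese table name, column name, remark> tuple into one string, embed it, and compare fields by cosine similarity.
+
+```python
+from text2vec import SentenceModel  # assumed encoder; any sentence-embedding model would do
+import numpy as np
+
+model = SentenceModel()  # defaults to a Chinese sentence-embedding checkpoint
+
+def field_vec(tablename, tablecname, colname, remark):
+    # strategy III: embed the flattened 4-tuple, not the bare column name
+    return model.encode([f"{tablename} {tablecname} {colname} {remark}"])[0]
+
+a = field_vec("table_1", "用户表", "id", "ID")
+b = field_vec("table_2", "文件记录表", "create_by", "creator")
+cos = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+print(cos)  # a high score marks a foreign-key candidate; table context damps the id/create_time false hits
+```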
+
+

BIN
data/FJS-OCR 富士通识别平台 数据库设计说明书.docx


BIN
data/url-qqp17mI32jTyozQt.docx


BIN
data/url-ukWkMKhnRgCvxVZt.docx


BIN
data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx


BIN
data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx


BIN
data/数据库设计(1).docx


BIN
data/数据库设计(2).docx


BIN
data/数据库设计文档.docx


BIN
data/数据库设计说明书.docx


BIN
data/电商-数据库详细设计说明书V0.4.docx


BIN
data/租房查询系统_数据库设计说明书_2.0.docx


+ 5 - 0
dict/columnname.txt

@@ -0,0 +1,5 @@
+名称
+字段名
+Field Name
+字段代码
+代码

+ 14 - 0
dict/columntype.txt

@@ -0,0 +1,14 @@
+int
+bigint
+float
+double
+decimal
+date
+datetime
+char
+varchar
+text
+longtext
+blob
+bool
+boolean

+ 5 - 0
dict/comment.txt

@@ -0,0 +1,5 @@
+Alias
+关联字段
+Description
+说明
+描述

+ 24 - 0
excel2sql.py

@@ -0,0 +1,24 @@
+import pandas as pd
+
+from py2neo import Node, Graph, Relationship
+
+graph = Graph('http://192.168.1.202:7474/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+io = '''数据表结构.xlsx'''
+
+df = pd.read_excel(io, sheet_name='Sheet1', header=[0])
+
+df.字段描述.fillna(value='', inplace=True)   # empty string instead of NaN in descriptions
+df['字段'] = df.字段.str.upper()             # normalize field names to upper case
+
+for row in df.itertuples():
+    try:
+        # one node per table and one per column, linked by a "has" relationship
+        start_node = Node("表", name=row.表, c_name=row.表名)
+        end_node   = Node("列", name=row.字段, type=row.字段类型, detail=row.字段描述)
+        relation   = Relationship(start_node, 'has', end_node)
+        graph.merge(start_node, "表", "name")
+        graph.merge(end_node, "列", "name")
+        graph.merge(relation)  # endpoints are bound after the node merges above
+    except Exception as exc:
+        print(row, exc)
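+
+# Illustrative follow-up (assumes the merges above succeeded): list a few
+# table -> column pairs straight from Neo4j via py2neo's Graph.run().
+for record in graph.run(
+        "MATCH (t:`表`)-[:has]->(c:`列`) "
+        "RETURN t.name AS table_name, c.name AS column_name LIMIT 10").data():
+    print(record)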

+ 168 - 0
jiebasim.py

@@ -0,0 +1,168 @@
+# import jieba
+# import numpy as np
+# import re
+ 
+# def get_word_vector(s1, s2):
+#     """
+#     :param s1: sentence 1
+#     :param s2: sentence 2
+#     :return: the term-frequency vectors of the two sentences
+#     """
+#     # tokenize with jieba
+#     cut1 = jieba.cut(s1)
+#     cut2 = jieba.cut(s2)
+#     list_word1 = (','.join(cut1)).split(',')
+#     list_word2 = (','.join(cut2)).split(',')
+
+#     # take the union of the words from both sentences
+#     key_word = list(set(list_word1 + list_word2))
+#     # zero-filled arrays to hold the two term-frequency vectors
+#     word_vector1 = np.zeros(len(key_word))
+#     word_vector2 = np.zeros(len(key_word))
+
+#     # count term frequencies,
+#     # filling each vector position in turn
+#     for i in range(len(key_word)):
+#         # count how often each key word appears in each sentence
+#         for j in range(len(list_word1)):
+#             if key_word[i] == list_word1[j]:
+#                 word_vector1[i] += 1
+#         for k in range(len(list_word2)):
+#             if key_word[i] == list_word2[k]:
+#                 word_vector2[i] += 1
+
+#     # print the vectors
+#     print(word_vector1)
+#     print(word_vector2)
+#     return word_vector1, word_vector2
+
+
+# def cos_dist(vec1,vec2):
+#     """
+#     :param vec1: vector 1
+#     :param vec2: vector 2
+#     :return: the cosine similarity of the two vectors
+#     """
+#     dist1 = float(np.dot(vec1,vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2)))
+#     return dist1
+
+
+# if __name__ == '__main__':
+#     s1 = "允许空值"
+#     s2 = "是否为空"
+#     vec1, vec2 = get_word_vector(s1, s2)
+#     dist1 = cos_dist(vec1, vec2)
+#     print(dist1)
+
+
+from transformers import AutoTokenizer, TFAutoModel
+import tensorflow as tf
+import matplotlib.pyplot as plt
+
+# load the tokenizer and the model
+model_name = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = TFAutoModel.from_pretrained(model_name,
+                                    output_hidden_states=True)  # Whether the model returns all hidden-states.
+
+# input test sentences
+utt = ['今天的月亮又大又圆', '月亮真的好漂亮啊', '今天去看电影吧', "爱情睡醒了,天琪抱着小贝进酒店", "侠客行风万里"]
+inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=64)
+outputs = model(inputs)
+hidden_states = outputs[2]  # all hidden-layer outputs
+"""
+About the output (hidden_states):
+1. The layer number (13 layers)
+2. The batch number (5 sentences), i.e. the number of input sentences
+3. The word / token number (64 tokens per sentence), i.e. max_length
+4. The hidden unit / feature number (768 features)
+
+Why 13 layers when BERT has 12? The first entry is the input embedding layer;
+the remaining 12 are BERT's encoder layers.
+"""
+print("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
+
+layer_i = 0
+print("Number of batches:", len(hidden_states[layer_i]))
+
+batch_i = 0
+print("Number of tokens:", len(hidden_states[layer_i][batch_i]))
+
+token_i = 0
+print("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
+
+# For the 5th token in our sentence, select its feature values from layer 5.
+token_i = 5
+layer_i = 5
+vec = hidden_states[layer_i][batch_i][token_i]
+
+# Plot the values as a histogram to show their distribution.
+plt.figure(figsize=(10, 10))
+plt.hist(vec, bins=200)
+plt.show()
+
+
+# Concatenate the tensors for all layers. We use `stack` here to
+# create a new dimension in the tensor.
+sentence_embeddings = tf.stack(hidden_states, axis=0)  # insert a new axis 0, so the 13 layers come first
+print(f"sentence_embeddings.shape : {sentence_embeddings.shape}")
+
+# permute to [sentence, token, layer, feature], so each token carries its 13 layer embeddings
+sentence_embeddings_perm = tf.transpose(sentence_embeddings, perm=[1, 2, 0, 3])
+print(f"sentence_embeddings_perm.shape : {sentence_embeddings_perm.shape}")
+
+# Token-level dense vectors
+## Option 1: concatenate the last four layers
+for sentence_embedding in sentence_embeddings_perm:  # one embedding block per sentence
+    print(f"sentence_embedding.shape: {sentence_embedding.shape}")
+    token_vecs_cat = []
+    for token_embedding in sentence_embedding:  # one embedding block per token
+        print(f"token_embedding.shape : {token_embedding.shape}")
+        cat_vec = tf.concat([token_embedding[-1], token_embedding[-2], token_embedding[-3], token_embedding[-4]], axis=0)
+        print(f"cat_vec.shape : {cat_vec.shape}")
+        token_vecs_cat.append(cat_vec)
+    print(f"len(token_vecs_cat) : {len(token_vecs_cat)}")
+
+## Option 2: sum the last four layers
+for sentence_embedding in sentence_embeddings_perm:  # one embedding block per sentence
+    print(f"sentence_embedding.shape: {sentence_embedding.shape}")
+    token_vecs_cat = []
+    for token_embedding in sentence_embedding:  # one embedding block per token
+        print(f"token_embedding.shape : {token_embedding.shape}")
+        cat_vec = sum(token_embedding[-4:])
+        print(f"cat_vec.shape : {cat_vec.shape}")
+        token_vecs_cat.append(cat_vec)
+    print(f"len(token_vecs_cat) : {len(token_vecs_cat)}")
+
+
+# Sentence-level dense vectors
+## average every token's second-to-last layer
+token_vecs = sentence_embeddings[-2]
+print(f"token_vecs.shape : {token_vecs.shape}")
+sentences_embedding = tf.reduce_mean(token_vecs, axis=1)
+print(f"sentences_embedding.shape : {sentences_embedding.shape}")
+
+
+# Cosine similarity
+## similarity between different sentences
+tensor_test = sentences_embedding[0]
+consine_sim_tensor = tf.keras.losses.cosine_similarity(tensor_test, sentences_embedding)
+print(f"consine_sim_tensor : {consine_sim_tensor}")
+
+
+## how the vectors of the same word "bank" compare across different contexts
+utt = ["After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]
+inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=22)
+outputs = model(inputs)
+hidden_states = outputs[2]  # all hidden-layer outputs
+tokens_embedding = tf.reduce_sum(hidden_states[-4:], axis=0) # sum of the last four layers
+bank_vault = tokens_embedding[0][6]
+bank_robber = tokens_embedding[0][10]
+river_bank = tokens_embedding[0][19]
+consine_sim_tensor = tf.keras.losses.cosine_similarity(bank_vault, [bank_robber, river_bank])
+print(f"consine_sim_tensor : {consine_sim_tensor}")
+# consine_sim_tensor : [-0.93863535 -0.69570863]
+# bank in bank_vault and bank in bank_robber are more similar to each other than to the river-bank sense, which is reasonable!
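+
+
+## Follow-up sketch (an added illustration): score two column remarks with the
+## same mean-pooled second-to-last-layer trick. A Chinese checkpoint such as
+## "bert-base-chinese" would suit these strings better than bert-base-uncased;
+## the pooling logic is identical.
+utt = ["允许空值", "是否为空"]
+inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=16)
+outputs = model(inputs)
+remark_embeddings = tf.reduce_mean(outputs[2][-2], axis=1)  # [2, 768]
+consine_sim = tf.keras.losses.cosine_similarity(remark_embeddings[0], remark_embeddings[1])
+print(f"remark similarity : {consine_sim}")  # closer to -1 means more similar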
+
+

+ 394 - 0
test.py

@@ -0,0 +1,394 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+
+Column-type check: re.match(r'\w+\(\d+\)', s)
+
+1. Decide whether a table is standard [criterion] -> does it contain column types -> locate the columns
+2. Detect the column-name column [yes] -> check it follows the naming convention
+3. Detect the Chinese column name [rule] -> locate the Chinese column name
+4. Detect the column type [yes] -> check it follows the type convention
+5. Detect the primary key [not found] -> check whether the column is an ID
+6. Detect the required flag [criterion] -> classify [nullable | required] -> [no such column] -> default to required
+7. Detect the column comment [not found] -> Chinese name [not found] -> table name + column name -> [bool/time/blob-like types] -> append a UUID
+8. Field-uniqueness key: <column name, column type, comment>
+"""
+import re
+import uuid
+import logging
+from collections import Counter
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+import jieba
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+# from text2vec import Similarity
+# sim = Similarity()
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+# # column-type dictionary
+# with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_TYPES = {i.strip() for i in fp.readlines()}
+# # column-name dictionary
+# with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_DICT = {i.strip(): 'name' for i in fp.readlines()}
+# # comment dictionary
+# with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
+#     REMARK_DICT = {i.strip(): 'remark' for i in fp.readlines()}
+
+
+COLUMN_TYPES = {'int', 'bigint', 'tinyint', 'smallint', 'bigint unsigned', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
+
+TYPE_DICT = {'类型': 'column_type', '数据类型': 'column_type'}
+
+COLUMN_DICT = {'名称': 'name', '字段名': 'name', 'field name': 'name', '字段代码': 'name', '代码': 'name', '物理字段名': 'name'}
+
+C_NAME_DICT = {'字段中文名': 'c_name', '中文含义': 'c_name', '名字': 'c_name', '字段名称': 'c_name', '逻辑字段名': 'c_name'}
+
+REMARK_DICT = {'Alias': 'remark', 'description': 'remark', '说明': 'remark', '描述': 'remark', '备注': 'remark'}
+
+REQUIRED_DICT = {'空/非空': 'required', '可不可以为空': 'required', '是否为空': 'required', '允许空值': 'required', '是否必填': 'required', '空值': 'required'}
+
+PRIMARY_KEY_DICT = {'主键': 'primary_key'}
+
+FOREIGN_KEY_DICT = {'外键': 'foreign_key'}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+        self.namecount = dict({})
+        self.all_tables = pd.DataFrame()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text.lower())
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif hasattr(block, "text") and block.text:  # Table objects have no .text
+                yield ('Unknown', block)
+
+    # def clean_table(self, raw_table):
+    #     table = []
+    #     dirty_table = []
+    #     while raw_table and '' in raw_table[0]:
+    #         raw_table = raw_table[1:]
+
+    #     # 表格预处理
+    #     rowslen = [len(row) for row in raw_table]
+
+    #     if not rowslen:
+    #         return None
+
+    #     rowlen = Counter(rowslen).most_common(1)[0][0]
+    #     for i,l in enumerate(rowslen):
+    #         if l == rowlen:
+    #             table.append(raw_table[i])
+    #         else:
+    #             dirty_table.append(raw_table[i])
+    #     return table, dirty_table
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal'] and r[1]:
+                tablename = r[1]
+                logger.debug(tablename)
+            elif r[0] == 'Table':
+
+                # table = r[1]
+
+                # table, dirty_table = self.clean_table(r[1])
+                table, dirty_table = self.get_table(r[1])
+
+                if not table:
+                    continue
+
+                # decide whether this table needs to be parsed
+                if any({'fulltype', 'type'} & {self.detect_type(i) for i in table[1]}):
+
+                    ############################### parse the database table name ##############################
+                    if re.search("[a-zA-Z_]+", tablename):
+                        table_name = re.search("[a-zA-Z_]+", tablename).group()
+                        try:
+                            table_c_name = re.search('[\u4e00-\u9fa5]{3,}', tablename).group()
+                        except Exception:
+                            table_c_name = "未知表"
+                        logger.info(f"Found database table, name: {table_name}\tguessed Chinese name: {table_c_name}")
+                    else:
+                        table_name = "UnknownTable"
+                        table_c_name = "未知表"
+                    ############################### end of table-name parsing ###############################
+
+
+                    ############################# field adjustments start here ############################
+                    df = pd.DataFrame(table)
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+
+                    # normalize header names to the canonical keys
+                    df.rename(columns={**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **PRIMARY_KEY_DICT, **FOREIGN_KEY_DICT}, inplace=True)
+
+                    df['table_name'] = table_name
+                    df['table_c_name'] = table_c_name
+
+                    # locate the column-type column by probing the first data row
+                    for i in df.columns:
+                        if self.detect_type(df.loc[1, i]) != 'unknown':
+                            df.rename(columns={i: 'column_type'}, inplace=True)
+                            break
+
+                    # nullable check (commented out): fields allowed to be null must not be linked
+                    # for i in df.columns:
+                    #     if sim.get_score(i, '允许值为空') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: str(uuid.uuid1()) if x != 'n' else '')
+                    #         break
+                    #     elif sim.get_score(i, '必填') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: '' if x != 'n' else str(uuid.uuid1()))
+                    #         break
+                    # if 'unique' not in df.columns:
+                    #     print("无法判断字段是否必填")
+                    #     df['unique'] = ''
+
+                    ############################### required-field check ###############################
+                    if 'required' not in df.columns:
+                        logger.warning("Cannot tell whether fields are required; defaulting to required, so every field may be linked!")
+                        df['required'] = ''
+                    ############################### required-field check ###############################
+
+                    # a remark column is mandatory
+                    if 'remark' not in df.columns:
+                        if 'c_name' not in df.columns:
+                            logger.warning(f"No remark column found; current columns: {df.columns}")
+                            df['remark'] = ''
+                        else:
+                            logger.warning(f"No remark column found, falling back to the Chinese column name; current columns: {df.columns}")
+                            df['remark'] = df['c_name']
+
+                    ############################### make designated fields unlinkable ###############################
+                    df['remark'] = df['remark'] + df['column_type'].apply(lambda x: str(uuid.uuid1()) if x in ['date', 'datetime', 'blob', 'text'] else '')
+                    df['remark'] = df['remark'] + df['name'].apply(lambda x: str(uuid.uuid1()) if x in ['create_time', 'update_time'] else '')
+                    df['remark'] = df.apply(lambda x: x['name'] if not x['remark'] else x['remark'], axis=1)
+                    ############################### make designated fields unlinkable ###############################
+
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+                    pass
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+
+
+                    if not df.query(' name in ["id", "ID", "Id"] ').empty:
+                        idx = df.query(' name in ["id", "ID", "Id"] ').index[0]
+                        df.loc[idx, 'remark'] = ''.join([table_name, '_id'])
+
+                    # the field-uniqueness key
+                    logger.debug(f'''remark type: {type(df.loc[1, 'remark'])}''')
+                    logger.debug(f'''column_type type: {type(df.loc[1, 'column_type'])}''')
+
+                    df['vec'] = df['name'] + '_' + df['column_type'] + '_' + df['remark']
+                    # ############################### field adjustments end here ##############################
+
+
+                    # if self.draw:
+                    #     self.G.add_node(table_name)
+
+                    #     nodelist = [
+                    #         (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                    #     ]
+                    #     self.G.add_nodes_from(nodelist)
+
+                    #     edgelist = [
+                    #         (table_name, node[0]) for node in nodelist
+                    #     ]
+                    #     self.G.add_edges_from(edgelist)
+
+                    # create the table node, reusing it if it already exists
+                    result = graph.nodes.match("表", name = table_name, c_name=table_c_name)
+                    if len(result) > 0:
+                        start_node = result.first()
+                    else:
+                        start_node = Node("表", name=table_name, c_name=table_c_name)
+                        graph.create(start_node)
+
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        ############################# edit here ############################
+                        # the merge key for the field
+                        name = item['name']
+
+                        if not item['vec']:
+                            item['vec'] = table_name + '_' + name
+                        ############################# end edit ############################
+
+                        # create or reuse the field node
+                        result = graph.nodes.match("字段", vec=item['vec'])
+
+                        if len(result) > 0:
+                            # a matching field already exists: reuse it
+                            end_node = result.first()
+                            relationship = Relationship(start_node, "related", end_node, **{"name": item['name']})
+                        else:
+                            # no match: create a new field node
+                            end_node = Node("字段", **item)
+                            graph.create(end_node)
+                            relationship = Relationship(start_node, "has", end_node)
+
+                        # create the table-field relationship
+                        graph.merge(relationship)
+                else:
+                    print("non-standard table:", table)
+            else:
+                print(r)
+        print(self.all_tables.columns)
+        print(self.all_tables)
+
+    def detect_type(self, text: str):
+        # 'fulltype' matches e.g. varchar(64); 'type' matches a bare type such as int
+        fulltype = re.match(r'(\w+)\(\d+\)', text)
+        if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+            return 'fulltype'
+        elif text in COLUMN_TYPES:
+            return 'type'
+        else:
+            return 'unknown'
+    
+    def get_table(self, raw_table):
+        table = []
+        dirty_table = []
+        has_head = False
+        for row in raw_table:
+            if has_head:
+                table.append(row)
+            elif set(row) & set({**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **FOREIGN_KEY_DICT}.keys()):
+                head = row
+                has_head = True
+            else:
+                dirty_table.append(row)
+
+        # for row in raw_table:
+        #     if get_head:
+        #         table.append(row)
+        #         continue
+
+        #     for col in row:
+        #         fulltype = re.match(r'(\w+)\(\d+\)', col)
+        #         if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #         elif col in COLUMN_TYPES:
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #     else:
+        #         head = row
+
+        if table and (len(head) == len(table[0])) and (len(Counter([len(_) for _ in table]).keys()) == 1):
+            table.insert(0, head)
+            return table, dirty_table
+        else:
+            return None, dirty_table
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    # path = '''data/数据库设计说明书.docx'''
+    path = '''data/数据库设计文档.docx'''
+    path = '''data/数据库设计(1).docx'''
+    path = '''data/数据库设计(2).docx'''
+    path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+    path = '''data/租房查询系统_数据库设计说明书_2.0.docx'''
+    path = '''data/url-ukWkMKhnRgCvxVZt.docx'''
+    path = '''data/url-qqp17mI32jTyozQt.docx'''
+    path = '''data/电商-数据库详细设计说明书V0.4.docx'''
+
+    word = Word(path)
+    word.predict()
+
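+    # Illustrative follow-up (an assumed usage pattern, not part of predict()):
+    # fields reached by more than one table via has/related edges are the
+    # foreign-key candidates the README describes.
+    for record in graph.run(
+            "MATCH (a:`表`)-[:has|related]->(f:`字段`)<-[:has|related]-(b:`表`) "
+            "WHERE a.name < b.name "
+            "RETURN a.name AS table_a, f.name AS field, b.name AS table_b").data():
+        print(record)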

+ 89 - 0
wordsim.py

@@ -0,0 +1,89 @@
+# coding=utf-8
+
+import gzip
+from gensim.models import Word2Vec
+from gensim.test.utils import common_texts
+
+# sentences: the corpus to analyse
+# size: dimensionality of the word vectors, default 100 (vector_size in gensim 4.x)
+# window: context window on each side, default 5
+# sg: model choice; 0 = CBOW (default), 1 = Skip-Gram
+# hs: 0 = negative sampling (default); 1 = hierarchical softmax (when negative > 0)
+# negative: number of negative samples, default 5; [3, 10] is recommended
+# cbow_mean: 0 = sum the context vectors, 1 = average them (default; not worth changing)
+# min_count: minimum word frequency to keep; workers: number of worker threads
+# iter: maximum number of SGD epochs, default 5 (epochs in gensim 4.x); raise it for large corpora
+# alpha: initial SGD learning rate, default 0.025
+# min_alpha: floor of the decaying learning rate; tune alpha, min_alpha and iter together on large corpora
+
+model_simple = Word2Vec(sentences=common_texts, window=1,
+                                      min_count=1, workers=4)
+# returns (effective word count, total raw words processed)
+print(model_simple.train([["hello", "world", "michael"]], total_examples=1, epochs=2))
+
+sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+
+model_simple = Word2Vec(min_count=1)
+model_simple.build_vocab(sentences)  # build the vocabulary
+print(model_simple.train(sentences, total_examples=model_simple.corpus_count
+                         , epochs=model_simple.epochs))
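+
+# Quick sanity check on the toy corpus (an added illustration; gensim 4.x API).
+# With min_count=1 every token is in the vocabulary, though vectors trained on
+# two tiny sentences are essentially random.
+print(model_simple.wv.most_similar("cat", topn=2))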
+
+
+"""
+# load the OpinRank corpus: car and hotel reviews
+data_file="../nlp-in-practice-master/word2vec/reviews_data.txt.gz"
+
+
+# read the OpinRank corpus and preprocess it (note: needs `import gensim` for gensim.utils.simple_preprocess)
+def read_input(input_file):
+    with gzip.open (input_file, 'rb') as f:
+        for i, line in enumerate (f):
+            # preprocessing
+            yield gensim.utils.simple_preprocess(line)
+
+# load the OpinRank corpus and tokenize it
+documents = list(read_input(data_file))
+# print(documents)
+
+
+print(len(documents))
+
+# train the Word2Vec model (takes roughly 10 minutes)
+model = Word2Vec(documents,
+            vector_size=150, window=10,
+            min_count=2, workers=10)
+print(model.train(documents, total_examples=len(documents), epochs=10))
+
+
+# words similar to 'dirty'
+w1 = "dirty"
+print(model.wv.most_similar(positive=w1))
+# positive: similar words
+
+
+# words similar to 'polite'
+w1 = ["polite"]
+print(model.wv.most_similar(positive=w1, topn=6))
+# topn: list only the top n
+
+
+# words similar to 'france'
+w1 = ["france"]
+print(model.wv.most_similar(positive=w1, topn=6))
+# topn: list only the top n
+
+
+# similar to 'bed', 'sheet', 'pillow'; dissimilar to 'couch'
+w1 = ["bed",'sheet','pillow']
+w2 = ['couch']
+print(model.wv.most_similar(positive=w1, negative=w2, topn=10))
+# negative: dissimilar words
+
+# compare the similarity of two words
+print(model.wv.similarity(w1="dirty", w2="smelly"))
+print(model.wv.similarity(w1="dirty", w2="dirty"))
+print(model.wv.similarity(w1="dirty", w2="clean"))
+
+# pick the word that doesn't belong
+print(model.wv.doesnt_match(["cat", "dog", "france"]))
+"""

+ 24 - 0
wordtable-24-05-10.py

@@ -0,0 +1,24 @@
+import pandas as pd
+
+from py2neo import Node, Graph, Relationship
+
+graph = Graph('http://192.168.1.202:7474/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+io = '''数据表结构.xlsx'''
+
+df = pd.read_excel(io, sheet_name='Sheet1', header=[0])
+
+df.字段描述.fillna(value='', inplace=True)   # empty string instead of NaN in descriptions
+df['字段'] = df.字段.str.upper()             # normalize field names to upper case
+
+for row in df.itertuples():
+    try:
+        # one node per table and one per column, linked by a "has" relationship
+        start_node = Node("表", name=row.表, c_name=row.表名)
+        end_node   = Node("列", name=row.字段, type=row.字段类型, detail=row.字段描述)
+        relation   = Relationship(start_node, 'has', end_node)
+        graph.merge(start_node, "表", "name")
+        graph.merge(end_node, "列", "name")
+        graph.merge(relation)  # endpoints are bound after the node merges above
+    except Exception as exc:
+        print(row, exc)

+ 185 - 0
wordtable-24-05-12.py

@@ -0,0 +1,185 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+"""
+import re
+import logging
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+
+import uuid
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+coltypes = {'int', 'bigint', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
+
+coldict = {'名称': 'colname', '字段名': 'colname', 'Field Name': 'colname', '关联字段': 'alias', 'Alias': 'alias'}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text)
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal']:
+                tablename = r[1]
+                logger.debug(tablename)
+            if r[0] == 'Table':
+                # decide whether this table needs to be parsed
+                if any(coltypes & {i.lower() for i in r[1][1]}):
+                    # parse the database table name
+                    if re.search("[a-zA-Z_]+", tablename):
+                        tablename = re.search("[a-zA-Z_]+", tablename).group()
+                    logger.info(f"Found database table, name: {tablename}")
+
+                    df = pd.DataFrame(r[1])
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+
+                    if self.enable_draw:
+                        self.G.add_node(tablename)
+
+                        nodelist = [
+                            (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                        ]
+                        self.G.add_nodes_from(nodelist)
+
+                        edgelist = [
+                            (tablename, node[0]) for node in nodelist
+                        ]
+                        self.G.add_edges_from(edgelist)
+
+                    # create the table node
+                    start_node = Node("表", name=tablename)
+                    graph.merge(start_node, "表", "name")
+
+                    # normalize header names to the canonical keys
+                    df.rename(columns=coldict, inplace=True)
+                    # an alias column is mandatory
+                    if 'alias' not in df.columns:
+                        logger.warning(f"No Alias column found; current columns: {df.columns}")
+                        df['alias'] = ''
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        # the merge key for the field
+                        colname = item['colname']
+                        if not item['alias']:
+                            item['alias'] = tablename + '_' + colname
+
+                        # create the field node, merged on its alias
+                        end_node = Node("字段", name=colname, **item)
+                        graph.merge(end_node, "字段", "alias")
+
+                        # create the table-field relationship; Relationship.type("has")
+                        # returns a class whose instances merge on (start, type, end)
+                        HAS = Relationship.type("has")
+                        graph.merge(HAS(start_node, end_node))
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    path = '''数据库设计文档.docx'''
+    path = '''数据库设计(1).docx'''
+    path = '''数据库设计(2).docx'''
+    # path = '''国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    # path = '''FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    # path = '''中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+
+    word = Word(path)
+    word.predict()

+ 211 - 0
wordtable-24-05-14.py

@@ -0,0 +1,211 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+
+Column-type check: re.match(r'\w+\(\d+\)', s)
+"""
+import re
+import logging
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+
+import uuid
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+# column-type dictionary
+with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
+    coltypes = {i.strip() for i in fp.readlines()}
+# column-name dictionary
+with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
+    coldict = {i.strip(): 'colname' for i in fp.readlines()}
+# comment dictionary
+with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
+    aliasdict = {i.strip(): 'alias' for i in fp.readlines()}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text)
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text)
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip())
+                    tables.append(rows)
+                yield ('Table', tables)
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal']:
+                tablename = r[1]
+                logger.debug(tablename)
+            elif r[0] == 'Table':
+                # decide whether this table needs to be parsed
+                if any(coltypes & {i.lower() for i in r[1][1]}):
+                    # parse the database table name
+                    if re.search("[a-zA-Z_]+", tablename):
+                        tablename = re.search("[a-zA-Z_]+", tablename).group()
+                        logger.info(f"Found database table, name: {tablename}")
+
+                    ############################# edit here ############################
+                    df = pd.DataFrame(r[1])
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+                    ############################# end edit ############################
+
+                    if self.enable_draw:
+                        self.G.add_node(tablename)
+
+                        nodelist = [
+                            (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                        ]
+                        self.G.add_nodes_from(nodelist)
+
+                        edgelist = [
+                            (tablename, node[0]) for node in nodelist
+                        ]
+                        self.G.add_edges_from(edgelist)
+
+                    # create the table node
+                    start_node = Node("表", name=tablename)
+                    graph.create(start_node)
+
+                    ############################### edit here ##############################
+                    # normalize header names to the canonical keys
+                    df.rename(columns=coldict, inplace=True)
+                    df.rename(columns=aliasdict, inplace=True)
+                    df['tablename'] = tablename
+                    df['fullname'] = df['tablename'] + '_' + df['colname']
+                    ############################### end edit ##############################
+
+                    # an alias (comment) column is mandatory
+                    if 'alias' not in df.columns:
+                        logger.warning(f"No comment column found; current columns: {df.columns}")
+                        df['alias'] = ''
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        ############################# edit here ############################
+                        # the merge key for the field
+                        colname = item['colname']
+
+                        if not item['alias']:
+                            item['alias'] = tablename + '_' + colname
+                        ############################# end edit ############################
+
+                        # create or reuse the field node
+                        result = graph.nodes.match("字段", alias=item['alias'])
+
+                        if len(result) > 0:
+                            # a matching field already exists: reuse it as a foreign-key link
+                            end_node = result.first()
+                            relationship = Relationship(start_node, "foreignkey", end_node, **{"name": item['colname']})
+                        else:
+                            # no match: create a new field node
+                            end_node = Node("字段", name=colname, **item)
+                            graph.create(end_node)
+                            relationship = Relationship(start_node, "has", end_node)
+
+                        # create the table-field relationship
+                        graph.merge(relationship)
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    path = '''data/数据库设计文档.docx'''
+    path = '''data/数据库设计(1).docx'''
+    # path = '''data/数据库设计(2).docx'''
+    # path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    # path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    # path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+
+    word = Word(path)
+    word.predict()

+ 394 - 0
wordtable-24-05-17.py

@@ -0,0 +1,394 @@
+"""
+Build a knowledge graph from the database design tables in a Word document.
+
+Column-name heuristics: 1. not a reserved word  2. letters and underscores  3. camelCase  4. beware of digits
+
+Column-type check: re.match(r'\w+\(\d+\)', s)
+
+1. Decide whether a table is standard [criterion] -> does it contain column types -> locate the columns
+2. Detect the column-name column [yes] -> check it follows the naming convention
+3. Detect the Chinese column name [rule] -> locate the Chinese column name
+4. Detect the column type [yes] -> check it follows the type convention
+5. Detect the primary key [not found] -> check whether the column is an ID
+6. Detect the required flag [criterion] -> classify [nullable | required] -> [no such column] -> default to required
+7. Detect the column comment [not found] -> Chinese name [not found] -> table name + column name -> [bool/time/blob-like types] -> append a UUID
+8. Field-uniqueness key: <column name, column type, comment>
+"""
+import re
+import uuid
+import logging
+from collections import Counter
+
+import docx
+from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import _Cell, Table
+from docx.text.paragraph import Paragraph
+import jieba
+import pandas as pd
+
+import networkx as nx
+import matplotlib.pyplot as plt
+
+# from text2vec import Similarity
+# sim = Similarity()
+
+from py2neo import Node, Graph, Relationship
+graph = Graph('bolt://192.168.1.150:7687/', user='neo4j', password='password', name="neo4j")
+graph.delete_all()
+
+
+# # column-type dictionary
+# with open('dict/columntype.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_TYPES = {i.strip() for i in fp.readlines()}
+# # column-name dictionary
+# with open('dict/columnname.txt', 'r', encoding='utf-8') as fp:
+#     COLUMN_DICT = {i.strip(): 'name' for i in fp.readlines()}
+# # comment dictionary
+# with open('dict/comment.txt', 'r', encoding='utf-8') as fp:
+#     REMARK_DICT = {i.strip(): 'remark' for i in fp.readlines()}
+
+
+COLUMN_TYPES = {'int', 'bigint', 'tinyint', 'smallint', 'bigint unsigned', 'float', 'double', 'decimal', 'date', 'datetime', 'char', 'varchar', 'text', 'longtext', 'blob', 'bool', 'boolean'}
+
+TYPE_DICT = {'类型': 'column_type', '数据类型': 'column_type'}
+
+COLUMN_DICT = {'名称': 'name', '字段名': 'name', 'field name': 'name', '字段代码': 'name', '代码': 'name', '物理字段名': 'name'}
+
+C_NAME_DICT = {'字段中文名': 'c_name', '中文含义': 'c_name', '名字': 'c_name', '字段名称': 'c_name', '逻辑字段名': 'c_name'}
+
+REMARK_DICT = {'Alias': 'remark', 'description': 'remark', '说明': 'remark', '描述': 'remark', '备注': 'remark'}
+
+REQUIRED_DICT = {'空/非空': 'required', '可不可以为空': 'required', '是否为空': 'required', '允许空值': 'required', '是否必填': 'required', '空值': 'required'}
+
+PRIMARY_KEY_DICT = {'主键': 'primary_key'}
+
+FOREIGN_KEY_DICT = {'外键': 'foreign_key'}
+
+
+class LoggerHandler(logging.Logger):
+    def __init__(self, name: str, console_handler_level: str = logging.INFO, fmt: str = '%(levelname)s: %(asctime)s: %(name)s: %(filename)s: %(lineno)d: %(funcName)s: %(message)s'):
+        super().__init__(name)
+        self.setLevel(logging.INFO)
+        self.fmt = logging.Formatter(fmt)
+        self.set_console_handler(console_handler_level)
+
+    def set_console_handler(self, console_handler_level: str = logging.INFO) -> None:
+        ch = logging.StreamHandler()
+        ch.setLevel(console_handler_level)
+        ch.setFormatter(self.fmt)
+        self.addHandler(ch)
+
+
+logger = LoggerHandler(__name__, fmt='%(levelname)s: %(asctime)s: %(lineno)d: %(funcName)s: %(message)s')
+
+
+class Word:
+    def __init__(self, path: str, draw: bool = False) -> None:
+        self.enable_draw = draw  # a distinct name, so the draw() method is not shadowed
+        self.doc = Document(path)
+        if draw:
+            self.G = nx.Graph()
+        self.namecount = dict({})
+        self.all_tables = pd.DataFrame()
+
+    def iter_block_item(self, parent):
+        if isinstance(parent, docx.document.Document):
+            parent_elm = parent.element.body
+        elif isinstance(parent, _Cell):
+            parent_elm = parent._tc
+        else:
+            raise ValueError("unsupported parent type")
+
+        for child in parent_elm.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def parse(self) -> tuple:
+        for block in self.iter_block_item(self.doc):
+            if block.style.name == 'Heading 1' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 2' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 3' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 4' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 5' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Heading 6' and block.text:
+                yield ('Heading', block.text.lower())
+            elif block.style.name == 'Normal' and block.text:
+                yield ('Normal', block.text.lower())
+            elif block.style.name == 'Table Grid':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif block.style.name == 'Normal Table':
+                tables = []
+                for row in block.rows:
+                    rows = []
+                    for cell in row.cells:
+                        for paragraph in cell.paragraphs:
+                            rows.append(paragraph.text.strip().lower())
+                    tables.append(rows)
+                yield ('Table', tables)
+            elif hasattr(block, "text") and block.text:  # Table objects have no .text
+                yield ('Unknown', block)
+
+    # def clean_table(self, raw_table):
+    #     table = []
+    #     dirty_table = []
+    #     while raw_table and '' in raw_table[0]:
+    #         raw_table = raw_table[1:]
+
+    #     # 表格预处理
+    #     rowslen = [len(row) for row in raw_table]
+
+    #     if not rowslen:
+    #         return None
+
+    #     rowlen = Counter(rowslen).most_common(1)[0][0]
+    #     for i,l in enumerate(rowslen):
+    #         if l == rowlen:
+    #             table.append(raw_table[i])
+    #         else:
+    #             dirty_table.append(raw_table[i])
+    #     return table, dirty_table
+
+    def predict(self):
+        for r in self.parse():
+            if r[0] in ['Heading', 'Normal'] and r[1]:
+                tablename = r[1]
+                logger.debug(tablename)
+            elif r[0] == 'Table':
+
+                # table = r[1]
+
+                # table, dirty_table = self.clean_table(r[1])
+                table, dirty_table = self.get_table(r[1])
+
+                if not table:
+                    continue
+
+                # decide whether this table needs to be parsed
+                if any({'fulltype', 'type'} & {self.detect_type(i) for i in table[1]}):
+
+                    ############################### parse the database table name ##############################
+                    if re.search("[a-zA-Z_]+", tablename):
+                        table_name = re.search("[a-zA-Z_]+", tablename).group()
+                        try:
+                            table_c_name = re.search('[\u4e00-\u9fa5]{3,}', tablename).group()
+                        except Exception:
+                            table_c_name = "未知表"
+                        logger.info(f"Found database table, name: {table_name}\tguessed Chinese name: {table_c_name}")
+                    else:
+                        table_name = "UnknownTable"
+                        table_c_name = "未知表"
+                    ############################### end of table-name parsing ###############################
+
+
+                    ############################# field adjustments start here ############################
+                    df = pd.DataFrame(table)
+                    df.columns = df.values.tolist()[0]
+                    df.drop([0], inplace=True)
+
+                    # normalize header names to the canonical keys
+                    df.rename(columns={**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **PRIMARY_KEY_DICT, **FOREIGN_KEY_DICT}, inplace=True)
+
+                    df['table_name'] = table_name
+                    df['table_c_name'] = table_c_name
+
+                    # locate the column-type column by probing the first data row
+                    for i in df.columns:
+                        if self.detect_type(df.loc[1, i]) != 'unknown':
+                            df.rename(columns={i: 'column_type'}, inplace=True)
+                            break
+
+                    # nullable check (commented out): fields allowed to be null must not be linked
+                    # for i in df.columns:
+                    #     if sim.get_score(i, '允许值为空') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: str(uuid.uuid1()) if x != 'n' else '')
+                    #         break
+                    #     elif sim.get_score(i, '必填') > 0.7:
+                    #         df['unique'] = df[i].apply(lambda x: '' if x != 'n' else str(uuid.uuid1()))
+                    #         break
+                    # if 'unique' not in df.columns:
+                    #     print("无法判断字段是否必填")
+                    #     df['unique'] = ''
+
+                    ############################### required-field check ###############################
+                    if 'required' not in df.columns:
+                        logger.warning("Cannot tell whether fields are required; defaulting to required, so every field may be linked!")
+                        df['required'] = ''
+                    ############################### required-field check ###############################
+
+                    # a remark column is mandatory
+                    if 'remark' not in df.columns:
+                        if 'c_name' not in df.columns:
+                            logger.warning(f"No remark column found; current columns: {df.columns}")
+                            df['remark'] = ''
+                        else:
+                            logger.warning(f"No remark column found, falling back to the Chinese column name; current columns: {df.columns}")
+                            df['remark'] = df['c_name']
+
+                    ############################### make designated fields unlinkable ###############################
+                    df['remark'] = df['remark'] + df['column_type'].apply(lambda x: str(uuid.uuid1()) if x in ['date', 'datetime', 'blob', 'text'] else '')
+                    df['remark'] = df['remark'] + df['name'].apply(lambda x: str(uuid.uuid1()) if x in ['create_time', 'update_time'] else '')
+                    df['remark'] = df.apply(lambda x: x['name'] if not x['remark'] else x['remark'], axis=1)
+                    ############################### make designated fields unlinkable ###############################
+
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+                    pass
+                    ############################### make nullable fields unlinkable (not yet implemented) ###############################
+
+
+                    if not df.query(' name in ["id", "ID", "Id"] ').empty:
+                        idx = df.query(' name in ["id", "ID", "Id"] ').index[0]
+                        df.loc[idx, 'remark'] = ''.join([table_name, '_id'])
+
+                    # the field-uniqueness key
+                    logger.debug(f'''remark type: {type(df.loc[1, 'remark'])}''')
+                    logger.debug(f'''column_type type: {type(df.loc[1, 'column_type'])}''')
+
+                    df['vec'] = df['name'] + '_' + df['column_type'] + '_' + df['remark']
+                    # ############################### field adjustments end here ##############################
+
+
+                    # if self.draw:
+                    #     self.G.add_node(table_name)
+
+                    #     nodelist = [
+                    #         (uuid.uuid1(), item) for item in df.to_dict(orient ='records')
+                    #     ]
+                    #     self.G.add_nodes_from(nodelist)
+
+                    #     edgelist = [
+                    #         (table_name, node[0]) for node in nodelist
+                    #     ]
+                    #     self.G.add_edges_from(edgelist)
+
+                    # create the table node, reusing it if it already exists
+                    result = graph.nodes.match("表", name = table_name, c_name=table_c_name)
+                    if len(result) > 0:
+                        start_node = result.first()
+                    else:
+                        start_node = Node("表", name=table_name, c_name=table_c_name)
+                        graph.create(start_node)
+
+
+                    # iterate over the table's fields
+                    for item in df.to_dict(orient='records'):
+
+                        # drop non-string keys so property insertion succeeds
+                        for key in set(item.keys()):
+                            if not isinstance(key, str):
+                                del item[key]
+
+                        ############################# edit here ############################
+                        # the merge key for the field
+                        name = item['name']
+
+                        if not item['vec']:
+                            item['vec'] = table_name + '_' + name
+                        ############################# end edit ############################
+
+                        # create or reuse the field node
+                        result = graph.nodes.match("字段", vec=item['vec'])
+
+                        if len(result) > 0:
+                            # a matching field already exists: reuse it
+                            end_node = result.first()
+                            relationship = Relationship(start_node, "related", end_node, **{"name": item['name']})
+                        else:
+                            # no match: create a new field node
+                            end_node = Node("字段", **item)
+                            graph.create(end_node)
+                            relationship = Relationship(start_node, "has", end_node)
+
+                        # create the table-field relationship
+                        graph.merge(relationship)
+                else:
+                    print("non-standard table:", table)
+            else:
+                print(r)
+        print(self.all_tables.columns)
+        print(self.all_tables)
+
+    def detect_type(self, text: str):
+        # 'fulltype' matches e.g. varchar(64); 'type' matches a bare type such as int
+        fulltype = re.match(r'(\w+)\(\d+\)', text)
+        if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+            return 'fulltype'
+        elif text in COLUMN_TYPES:
+            return 'type'
+        else:
+            return 'unknown'
+    
+    def get_table(self, raw_table):
+        table = []
+        dirty_table = []
+        has_head = False
+        for row in raw_table:
+            if has_head:
+                table.append(row)
+            elif set(row) & set({**TYPE_DICT, **COLUMN_DICT, **C_NAME_DICT, **REMARK_DICT, **REQUIRED_DICT, **FOREIGN_KEY_DICT}.keys()):
+                head = row
+                has_head = True
+            else:
+                dirty_table.append(row)
+
+        # for row in raw_table:
+        #     if get_head:
+        #         table.append(row)
+        #         continue
+
+        #     for col in row:
+        #         fulltype = re.match(r'(\w+)\(\d+\)', col)
+        #         if fulltype and (fulltype.group(1) in COLUMN_TYPES):
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #         elif col in COLUMN_TYPES:
+        #             table.append(row)
+        #             get_head = True
+        #             break
+        #     else:
+        #         head = row
+
+        if table and (len(head) == len(table[0])) and (len(Counter([len(_) for _ in table]).keys()) == 1):
+            table.insert(0, head)
+            return table, dirty_table
+        else:
+            return None, dirty_table
+
+    def draw(self):
+        if self.enable_draw:
+            nx.draw(self.G, with_labels = True)
+            plt.show()
+        else:
+            return "Draw is not enabled"
+
+
+if __name__ == '__main__':
+    # successive reassignments below: the last uncommented path wins
+    # path = '''data/数据库设计说明书.docx'''
+    path = '''data/数据库设计文档.docx'''
+    path = '''data/数据库设计(1).docx'''
+    path = '''data/数据库设计(2).docx'''
+    path = '''data/国家电投人才猎头智能人才库项目-数据库设计说明书.docx'''
+    path = '''data/FJS-OCR 富士通识别平台 数据库设计说明书.docx'''
+    path = '''data/中国跳水队智能辅助教练系统-国际比赛数据 数据库设计说明书.docx'''
+    path = '''data/租房查询系统_数据库设计说明书_2.0.docx'''
+    path = '''data/url-ukWkMKhnRgCvxVZt.docx'''
+    path = '''data/url-qqp17mI32jTyozQt.docx'''
+    path = '''data/电商-数据库详细设计说明书V0.4.docx'''
+
+    word = Word(path)
+    word.predict()
+