# jiebasim.py
# NOTE: jieba-based bag-of-words cosine similarity, kept commented out.
# import jieba
# import numpy as np
# import re
#
#
# def get_word_vector(s1, s2):
#     """
#     :param s1: sentence 1
#     :param s2: sentence 2
#     :return: the word-count vectors of the two sentences
#     """
#     # Word segmentation
#     cut1 = jieba.cut(s1)
#     cut2 = jieba.cut(s2)
#     list_word1 = (','.join(cut1)).split(',')
#     list_word2 = (','.join(cut2)).split(',')
#     # List every word: take the union of both sentences
#     key_word = list(set(list_word1 + list_word2))
#     # Zero-filled arrays of the given length hold the vectors
#     word_vector1 = np.zeros(len(key_word))
#     word_vector2 = np.zeros(len(key_word))
#     # Count word frequencies
#     # Fill in each position of the vectors in turn
#     for i in range(len(key_word)):
#         # Count how often each word of key_word occurs in each sentence
#         for j in range(len(list_word1)):
#             if key_word[i] == list_word1[j]:
#                 word_vector1[i] += 1
#         for k in range(len(list_word2)):
#             if key_word[i] == list_word2[k]:
#                 word_vector2[i] += 1
#     # Print the vectors
#     print(word_vector1)
#     print(word_vector2)
#     return word_vector1, word_vector2
#
#
# def cos_dist(vec1, vec2):
#     """
#     :param vec1: vector 1
#     :param vec2: vector 2
#     :return: the cosine similarity of the two vectors
#     """
#     dist1 = float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
#     return dist1
#
#
# if __name__ == '__main__':
#     s1 = "允许空值"
#     s2 = "是否为空"
#     vec1, vec2 = get_word_vector(s1, s2)
#     dist1 = cos_dist(vec1, vec2)
#     print(dist1)
  48. from transformers import AutoTokenizer, TFAutoModel
  49. import tensorflow as tf
  50. import matplotlib.pyplot as plt
  51. # 加载模型
  52. model_name = "bert-base-uncased"
  53. tokenizer = AutoTokenizer.from_pretrained(model_name)
  54. model = TFAutoModel.from_pretrained(model_name,
  55. output_hidden_states=True) # Whether the model returns all hidden-states.
  56. # 输入测试句子
  57. utt = ['今天的月亮又大又圆', '月亮真的好漂亮啊', '今天去看电影吧', "爱情睡醒了,天琪抱着小贝进酒店", "侠客行风万里"]
  58. inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=64)
  59. outputs = model(inputs)
  60. hidden_states = outputs[2] # 获得各个隐藏层输出
  61. """
  62. 解释下输出(hidden_states):
  63. 1. The layer number (13 layers)
  64. 2. The batch number (5 sentence) 也就是输入句子的个数
  65. 3. The word / token number (64 tokens in our sentence) 也就是max_length
  66. 4. The hidden unit / feature number (768 features)
  67. 疑惑点:
  68. 1.为啥是13层?bert不是12层吗?
  69. 第一层是输入的嵌入层,其余12层才是bert的
  70. """
  71. print("Number of layers:", len(hidden_states), " (initial embeddings + 12 BERT layers)")
  72. layer_i = 0
  73. print("Number of batches:", len(hidden_states[layer_i]))
  74. batch_i = 0
  75. print("Number of tokens:", len(hidden_states[layer_i][batch_i]))
  76. token_i = 0
  77. print("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))
  78. # For the 5th token in our sentence, select its feature values from layer 5.
  79. token_i = 5
  80. layer_i = 5
  81. vec = hidden_states[layer_i][batch_i][token_i]
  82. # Plot the values as a histogram to show their distribution.
  83. plt.figure(figsize=(10, 10))
  84. plt.hist(vec, bins=200)
  85. plt.show()
  86. # Concatenate the tensors for all layers. We use `stack` here to
  87. # create a new dimension in the tensor.
  88. sentence_embeddings = tf.stack(hidden_states, axis=0) # 在维度0的位置插入,也就是把13放入最前面
  89. print(f"sentence_embeddings.shape : {sentence_embeddings.shape}")
  90. # 调换维度,使每个词都有13层的嵌入表示
  91. sentence_embeddings_perm = tf.transpose(sentence_embeddings, perm=[1, 2, 0, 3])
  92. print(f"sentence_embeddings_perm.shape : {sentence_embeddings_perm.shape}")
  93. # 获取词的稠密向量
  94. ## 第一种方式:拼接后四层的稠密向量
  95. for sentence_embedding in sentence_embeddings_perm: # 获取每个句子的embedding
  96. print(f"sentence_embedding.shape: {sentence_embedding.shape}")
  97. token_vecs_cat = []
  98. for token_embedding in sentence_embedding: # 获取句子每个词的embedding
  99. print(f"token_embedding.shape : {token_embedding.shape}")
  100. cat_vec = tf.concat([token_embedding[-1], token_embedding[-2], token_embedding[-3], token_embedding[-4]], axis=0)
  101. print(f"cat_vec.shape : {cat_vec.shape}")
  102. token_vecs_cat.append(cat_vec)
  103. print(f"len(token_vecs_cat) : {len(token_vecs_cat)}")
  104. ## 第二种方式:加和后四层的稠密向量
  105. for sentence_embedding in sentence_embeddings_perm: # 获取每个句子的embedding
  106. print(f"sentence_embedding.shape: {sentence_embedding.shape}")
  107. token_vecs_cat = []
  108. for token_embedding in sentence_embedding: # 获取句子每个词的embedding
  109. print(f"token_embedding.shape : {token_embedding.shape}")
  110. cat_vec = sum(token_embedding[-4:])
  111. print(f"cat_vec.shape : {cat_vec.shape}")
  112. token_vecs_cat.append(cat_vec)
  113. print(f"len(token_vecs_cat) : {len(token_vecs_cat)}")
  114. # 获取句子的稠密向量
  115. ## 平均每个token倒数第二层的稠密向量
  116. token_vecs = sentence_embeddings[-2]
  117. print(f"token_vecs.shape : {token_vecs.shape}")
  118. sentences_embedding = tf.reduce_mean(token_vecs, axis=1)
  119. print(f"sentences_embedding.shape : {sentences_embedding.shape}")
  120. # 计算余弦相似度
  121. ## 不同句子间的相似度
  122. tensor_test = sentences_embedding[0]
  123. consine_sim_tensor = tf.keras.losses.cosine_similarity(tensor_test, sentences_embedding)
  124. print(f"consine_sim_tensor : {consine_sim_tensor}")
  125. ##探讨下相同词bank在不同上下文时其vector的相似度
  126. utt = ["After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]
  127. inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=22)
  128. outputs = model(inputs)
  129. hidden_states = outputs[2] # 获得各个隐藏层输出
  130. tokens_embedding = tf.reduce_sum(hidden_states[-4:], axis=0) # 使用加和方式
  131. bank_vault = tokens_embedding[0][6]
  132. bank_robber = tokens_embedding[0][10]
  133. river_bank = tokens_embedding[0][19]
  134. consine_sim_tensor = tf.keras.losses.cosine_similarity(bank_vault, [bank_robber, river_bank])
  135. print(f"consine_sim_tensor : {consine_sim_tensor}")
  136. # consine_sim_tensor : [-0.93863535 -0.69570863]
  137. # 可以看出bank_vault(银行金库)和bank_robber(银行抢劫犯)中的bank相似度更高些,合理!