svc.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. import re
  2. import nltk
  3. from nltk.stem import WordNetLemmatizer
  4. from nltk.corpus import stopwords
  5. from sklearn.model_selection import train_test_split
  6. from sklearn.feature_extraction.text import CountVectorizer
  7. from sklearn.feature_extraction.text import TfidfVectorizer
  8. from sklearn.naive_bayes import MultinomialNB
  9. from sklearn.metrics import confusion_matrix
  10. from sklearn.metrics import classification_report
  11. from sklearn.linear_model import SGDClassifier
  12. from sklearn.svm import LinearSVC
  13. from sklearn.ensemble import RandomForestClassifier
  14. STOPWORDS = set(stopwords.words('english'))
  15. # # 预处理
  16. def preprocessing(text):
  17. # text = text.decode("utf-8")
  18. tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
  19. stops = stopwords.words('english')
  20. tokens = [token for token in tokens if token not in stops]
  21. tokens = [token.lower() for token in tokens if len(token)>=3]
  22. lmtzr = WordNetLemmatizer()
  23. tokens = [lmtzr.lemmatize(token) for token in tokens]
  24. preprocessed_text = ' '.join(tokens)
  25. return preprocessed_text
  26. # 加载数据集
  27. import pandas as pd
  28. # from pymongo import MongoClient
  29. # client = MongoClient("192.168.1.200", 27017)
  30. # col1 = client['education']['test']
  31. # col2 = client['education']['hallowmas']
  32. # file_path = 'train.xlsx'
  33. # df = pd.read_excel(file_path)
  34. # for i in range(len(df.index)):
  35. # doc = col1.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
  36. # if doc:
  37. # content = doc["content"]
  38. # else:
  39. # doc = col2.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
  40. # if doc:
  41. # content = doc["content"]
  42. # if content:
  43. # df.loc[i, 'content'] = content
  44. # df.to_excel('dataset.xlsx', sheet_name="Sheet1")
  45. train = pd.read_excel('dataset.xlsx', converters={"content":str, "tags":int})
  46. # 预处理
  47. def text_prepare(text):
  48. text = text.lower() # 字母小写化
  49. text = REPLACE_BY_SPACE_RE.sub(' ',text)
  50. text = BAD_SYMBOLS_RE.sub('',text)
  51. text = ' '.join([w for w in text.split() if w not in STOPWORDS]) # 删除停用词
  52. return text
  53. REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
  54. BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
  55. X_train, X_test, y_train, y_test = train_test_split(train.content, train.tags, test_size=0.2, random_state=0)
  56. X_train = [preprocessing(x) for x in X_train]
  57. X_test = [preprocessing(x) for x in X_test]
  58. # X_train = [text_prepare(x) for x in X_train]
  59. # X_test = [text_prepare(x) for x in X_test]
  60. cv = CountVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2), token_pattern='(\S+)', stop_words=STOPWORDS)
  61. feature = cv.fit_transform(X_train)
  62. tfidf = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2), token_pattern='(\S+)', stop_words=STOPWORDS)
  63. feature = tfidf.fit_transform(X_train)
  64. # # 朴素贝叶斯分类器
  65. # clf = MultinomialNB().fit(tfidf.transform(X_train), y_train)
  66. # y_nb_pred = clf.predict(tfidf.transform(X_test))
  67. # print('nb_confusion_matrix:')
  68. # print(confusion_matrix(y_test, y_nb_pred))
  69. # print('nb_classification_report:')
  70. # print(classification_report(y_test, y_nb_pred))
  71. # # SGD
  72. # clf = SGDClassifier(alpha=0.0001, max_iter=1000).fit(tfidf.transform(X_train), y_train)
  73. # y_SGD_pred = clf.predict(tfidf.transform(X_test))
  74. # print('SGD_confusion_matrix:')
  75. # print(confusion_matrix(y_test, y_SGD_pred))
  76. # print('SGD_classification_report:')
  77. # print(classification_report(y_test, y_SGD_pred))
  78. # svm
  79. clf = LinearSVC().fit(tfidf.transform(X_train), y_train)
  80. import joblib
  81. joblib.dump(clf, 'SVC.joblib')
  82. joblib.dump(tfidf, 'TFIDF.joblib')
  83. model = joblib.load('SVC.joblib')
  84. tfidf_model = joblib.load('TFIDF.joblib')
  85. y_svm_pred = model.predict(tfidf_model.transform(X_test))
  86. print('svm_confusion_matrix:')
  87. print(confusion_matrix(y_test, y_svm_pred))
  88. print('svm_classification_report:')
  89. print(classification_report(y_test, y_svm_pred))
  90. # # RandomForestClassifier
  91. # clf = RandomForestClassifier(n_estimators=10)
  92. # clf.fit(tfidf.transform(X_train), y_train)
  93. # y_RF_pred = clf.predict(tfidf.transform(X_test))
  94. # print('RF_confusion_matrix:')
  95. # print(confusion_matrix(y_test, y_RF_pred))
  96. # print('RF_classification_report:')
  97. # print(classification_report(y_test, y_RF_pred))