import re
import nltk
import joblib
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# NLTK data required on first run:
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
STOPWORDS = set(stopwords.words('english'))

# Preprocessing: tokenize, lowercase, drop short tokens and stopwords, lemmatize
lmtzr = WordNetLemmatizer()

def preprocessing(text):
    # text = text.decode("utf-8")  # Python 2 legacy; str is already unicode in Python 3
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Lowercase before the stopword filter so capitalized stopwords ("The") are
    # also removed; the STOPWORDS set gives O(1) membership tests.
    tokens = [token.lower() for token in tokens if len(token) >= 3]
    tokens = [token for token in tokens if token not in STOPWORDS]
    tokens = [lmtzr.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Load the dataset (originally assembled from MongoDB; kept for reference)
# from pymongo import MongoClient
# client = MongoClient("192.168.1.200", 27017)
# col1 = client['education']['test']
# col2 = client['education']['hallowmas']
# file_path = 'train.xlsx'
# df = pd.read_excel(file_path)
# for i in range(len(df.index)):
#     content = None  # reset each row so a miss does not reuse the previous row's content
#     doc = col1.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
#     if doc:
#         content = doc["content"]
#     else:
#         doc = col2.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
#         if doc:
#             content = doc["content"]
#     if content:
#         df.loc[i, 'content'] = content
# df.to_excel('dataset.xlsx', sheet_name="Sheet1")
train = pd.read_excel('dataset.xlsx', converters={"content": str, "tags": int})

# Alternative regex-based cleanup (unused; preprocessing() above is applied instead)
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def text_prepare(text):
    text = text.lower()                        # lowercase
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace punctuation with spaces
    text = BAD_SYMBOLS_RE.sub('', text)        # drop remaining bad symbols
    text = ' '.join([w for w in text.split() if w not in STOPWORDS])  # remove stopwords
    return text

X_train, X_test, y_train, y_test = train_test_split(
    train.content, train.tags, test_size=0.2, random_state=0)
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
# X_train = [text_prepare(x) for x in X_train]
# X_test = [text_prepare(x) for x in X_test]

# Alternative bag-of-words features (unused; the classifiers below consume TF-IDF)
# cv = CountVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
#                      token_pattern=r'(\S+)', stop_words=STOPWORDS)
# feature = cv.fit_transform(X_train)

tfidf = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                        token_pattern=r'(\S+)', stop_words=STOPWORDS)
feature = tfidf.fit_transform(X_train)  # fits the vectorizer; later code reuses tfidf.transform

# Naive Bayes classifier
# clf = MultinomialNB().fit(tfidf.transform(X_train), y_train)
# y_nb_pred = clf.predict(tfidf.transform(X_test))
# print('nb_confusion_matrix:')
# print(confusion_matrix(y_test, y_nb_pred))
# print('nb_classification_report:')
# print(classification_report(y_test, y_nb_pred))

# SGD
# clf = SGDClassifier(alpha=0.0001, max_iter=1000).fit(tfidf.transform(X_train), y_train)
# y_SGD_pred = clf.predict(tfidf.transform(X_test))
# print('SGD_confusion_matrix:')
# print(confusion_matrix(y_test, y_SGD_pred))
# print('SGD_classification_report:')
# print(classification_report(y_test, y_SGD_pred))

# SVM: train, persist both model and vectorizer, then reload them for evaluation
clf = LinearSVC().fit(tfidf.transform(X_train), y_train)
joblib.dump(clf, 'SVC.joblib')
joblib.dump(tfidf, 'TFIDF.joblib')
model = joblib.load('SVC.joblib')
tfidf_model = joblib.load('TFIDF.joblib')
y_svm_pred = model.predict(tfidf_model.transform(X_test))
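# --- Added illustration (not in the original pipeline): inspect what the SVM learned.
# A minimal sketch, assuming scikit-learn >= 1.0 (for get_feature_names_out) and a
# multi-class tag set; for a binary problem model.coef_ has a single row, so the
# zip below would pair it with the first class only.
import numpy as np

feature_names = np.array(tfidf_model.get_feature_names_out())
for class_label, weights in zip(model.classes_, model.coef_):
    top = np.argsort(weights)[-10:][::-1]  # indices of the ten largest weights
    print(class_label, feature_names[top])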
print('svm_confusion_matrix:')
print(confusion_matrix(y_test, y_svm_pred))
print('svm_classification_report:')
print(classification_report(y_test, y_svm_pred))

# RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=10)
# clf.fit(tfidf.transform(X_train), y_train)
# y_RF_pred = clf.predict(tfidf.transform(X_test))
# print('RF_confusion_matrix:')
# print(confusion_matrix(y_test, y_RF_pred))
# print('RF_classification_report:')
# print(classification_report(y_test, y_RF_pred))
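# --- Added illustration (not in the original pipeline): tagging a new document with
# the persisted artifacts. A minimal sketch: it assumes 'SVC.joblib' and 'TFIDF.joblib'
# were written by the training run above; predict_tag and the sample text are hypothetical.
def predict_tag(raw_text):
    vectorizer = joblib.load('TFIDF.joblib')
    classifier = joblib.load('SVC.joblib')
    cleaned = preprocessing(raw_text)           # same cleanup as at training time
    features = vectorizer.transform([cleaned])  # transform expects an iterable of documents
    return classifier.predict(features)[0]

# print(predict_tag("text of an unseen article ..."))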