123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
- import re
- import nltk
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import stopwords
- from sklearn.model_selection import train_test_split
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.naive_bayes import MultinomialNB
- from sklearn.metrics import confusion_matrix
- from sklearn.metrics import classification_report
- from sklearn.linear_model import SGDClassifier
- from sklearn.svm import LinearSVC
- from sklearn.ensemble import RandomForestClassifier
# English stop words from the NLTK corpus, kept as a set so that membership
# tests do not have to scan a list.
_english_stopword_list = stopwords.words('english')
STOPWORDS = set(_english_stopword_list)
# # Preprocessing
def preprocessing(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is split into sentences and then into words with NLTK,
    lowercased, stripped of English stop words and of tokens shorter than
    three characters, and finally lemmatized with WordNet.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Lowercase *before* the stop-word test: NLTK's English stop-word list is
    # lowercase, so testing the raw token would let "The", "And", ... through.
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # STOPWORDS is the module-level set built once at import time; using it
    # avoids reloading the corpus and scanning a list for every token.
    tokens = [
        token for token in tokens
        if token not in STOPWORDS and len(token) >= 3
    ]
    lmtzr = WordNetLemmatizer()
    return ' '.join(lmtzr.lemmatize(token) for token in tokens)
# Load the dataset
import pandas as pd

# Disabled one-off script that produced 'dataset.xlsx': for every row of
# 'train.xlsx' it looked up the page content by URL in two MongoDB
# collections (education.test, then education.hallowmas as a fallback) and
# stored it in the 'content' column.
# NOTE(review): the nesting below is reconstructed; the original indentation
# was lost, so confirm before re-enabling.
# from pymongo import MongoClient
# client = MongoClient("192.168.1.200", 27017)
# col1 = client['education']['test']
# col2 = client['education']['hallowmas']
# file_path = 'train.xlsx'
# df = pd.read_excel(file_path)
# for i in range(len(df.index)):
#     doc = col1.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
#     if doc:
#         content = doc["content"]
#     else:
#         doc = col2.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
#         if doc:
#             content = doc["content"]
#     if content:
#         df.loc[i, 'content'] = content
# df.to_excel('dataset.xlsx', sheet_name="Sheet1")

# Coerce 'content' to str and 'tags' to int while reading the spreadsheet.
column_converters = {"content": str, "tags": int}
train = pd.read_excel('dataset.xlsx', converters=column_converters)
# Preprocessing
def text_prepare(text):
    """Clean *text* with the module's regexes and drop stop words.

    Steps, in order: lowercase the text, replace every character matched by
    ``REPLACE_BY_SPACE_RE`` with a space, delete every character matched by
    ``BAD_SYMBOLS_RE``, then remove whitespace-separated words found in
    ``STOPWORDS``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)
    kept_words = [word for word in cleaned.split() if word not in STOPWORDS]
    return ' '.join(kept_words)
# Punctuation that text_prepare turns into a space: / ( ) { } [ ] | @ , ;
# Raw string: in a plain literal "\[", "\]" and "\|" are invalid escape
# sequences, which newer Python versions warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_';
# text_prepare deletes these characters.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.content, train.tags, test_size=0.2, random_state=0)
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
# Alternative regex-based cleaning, currently disabled:
# X_train = [text_prepare(x) for x in X_train]
# X_test = [text_prepare(x) for x in X_test]

# Options shared by both vectorizers.
#   - token_pattern is a raw string ("\S" is an invalid escape in a plain
#     literal); it treats every run of non-whitespace characters as a token.
#   - stop_words is passed as a list: scikit-learn validates this parameter
#     and accepts 'english', a list or None, so a set is rejected by
#     releases that enforce parameter validation.
_vectorizer_options = dict(
    min_df=5,
    max_df=0.9,
    ngram_range=(1, 2),
    token_pattern=r'(\S+)',
    stop_words=sorted(STOPWORDS),
)
cv = CountVectorizer(**_vectorizer_options)
# NOTE(review): these raw-count features are overwritten two lines below and
# `cv` is not used again in this part of the file; kept to preserve behavior.
feature = cv.fit_transform(X_train)
tfidf = TfidfVectorizer(**_vectorizer_options)
feature = tfidf.fit_transform(X_train)
# # Naive Bayes classifier
- # clf = MultinomialNB().fit(tfidf.transform(X_train), y_train)
- # y_nb_pred = clf.predict(tfidf.transform(X_test))
- # print('nb_confusion_matrix:')
- # print(confusion_matrix(y_test, y_nb_pred))
- # print('nb_classification_report:')
- # print(classification_report(y_test, y_nb_pred))
-
- # # SGD
- # clf = SGDClassifier(alpha=0.0001, max_iter=1000).fit(tfidf.transform(X_train), y_train)
- # y_SGD_pred = clf.predict(tfidf.transform(X_test))
- # print('SGD_confusion_matrix:')
- # print(confusion_matrix(y_test, y_SGD_pred))
- # print('SGD_classification_report:')
- # print(classification_report(y_test, y_SGD_pred))
-
# SVM: fit a linear SVC on the TF-IDF features, persist the classifier and the
# vectorizer, then evaluate on the held-out data using the copies reloaded
# from disk.
import joblib

svm_train_features = tfidf.transform(X_train)
clf = LinearSVC()
clf.fit(svm_train_features, y_train)

joblib.dump(clf, 'SVC.joblib')
joblib.dump(tfidf, 'TFIDF.joblib')

# Predict with the reloaded objects rather than the in-memory ones, so the
# reported metrics come from the saved artifacts.
model = joblib.load('SVC.joblib')
tfidf_model = joblib.load('TFIDF.joblib')

svm_test_features = tfidf_model.transform(X_test)
y_svm_pred = model.predict(svm_test_features)

print('svm_confusion_matrix:', confusion_matrix(y_test, y_svm_pred), sep='\n')
print('svm_classification_report:', classification_report(y_test, y_svm_pred), sep='\n')
- # # RandomForestClassifier
- # clf = RandomForestClassifier(n_estimators=10)
- # clf.fit(tfidf.transform(X_train), y_train)
- # y_RF_pred = clf.predict(tfidf.transform(X_test))
- # print('RF_confusion_matrix:')
- # print(confusion_matrix(y_test, y_RF_pred))
- # print('RF_classification_report:')
- # print(classification_report(y_test, y_RF_pred))
|