# xzc / education
							import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# English stop words from the NLTK corpus, stored as a set so membership
# checks do not scan a list.
_english_stopwords = stopwords.words('english')
STOPWORDS = set(_english_stopwords)

# # Preprocessing
def preprocessing(text):
    """Tokenize, filter and lemmatize *text* into a space-joined string.

    The text is split into sentences and then words with NLTK. Every token
    is lower-cased, English stop words and tokens shorter than three
    characters are dropped, and the remaining tokens are lemmatized with
    WordNet.

    Args:
        text: Raw document text (a ``str``).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Lower-case *before* the stop-word lookup: the NLTK stop-word list is
    # lower-case, so filtering on the original casing let capitalised stop
    # words such as "The" slip through.
    lowered_tokens = (
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    )
    # Use the module-level STOPWORDS set instead of reloading the corpus list
    # on every call and scanning it linearly for each token.
    kept_tokens = [
        token for token in lowered_tokens
        if len(token) >= 3 and token not in STOPWORDS
    ]
    lmtzr = WordNetLemmatizer()
    return ' '.join(lmtzr.lemmatize(token) for token in kept_tokens)

# Load the dataset
import pandas as pd
# from pymongo import MongoClient
# client = MongoClient("192.168.1.200", 27017)
# col1 = client['education']['test']
# col2 = client['education']['hallowmas']

# file_path = 'train.xlsx'
# df = pd.read_excel(file_path)
# for i in range(len(df.index)):
#     doc = col1.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
#     if doc:
#         content = doc["content"]
#     else:
#         doc = col2.find_one({"url": df.loc[i]['url']}, {"_id": 0, "content": 1})
#         if doc:
#             content = doc["content"]
#     if content:
#         df.loc[i, 'content'] = content
# df.to_excel('dataset.xlsx', sheet_name="Sheet1")

# Read the dataset spreadsheet, converting the "content" column with str and
# the "tags" column with int.
train = pd.read_excel(
    'dataset.xlsx',
    converters=dict(content=str, tags=int),
)

# Preprocessing
def text_prepare(text):
    """Normalize *text* with the module-level regexes and stop-word set.

    The text is lower-cased, every match of ``REPLACE_BY_SPACE_RE`` becomes a
    space, every match of ``BAD_SYMBOLS_RE`` is removed, and words found in
    ``STOPWORDS`` are dropped.

    Args:
        text: Raw document text (a ``str``).

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    cleaned = BAD_SYMBOLS_RE.sub('', spaced)
    # Drop stop words.
    kept_words = [word for word in cleaned.split() if word not in STOPWORDS]
    return ' '.join(kept_words)

# Patterns used by text_prepare(). Raw strings keep the backslashes literal:
# in a plain string "\[", "\]" and "\|" are invalid escape sequences, which
# modern Python versions warn about.
# Punctuation that text_prepare() replaces with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than digits, lower-case ASCII letters, space, '#', '+' and
# '_' is removed by text_prepare().
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.content, train.tags, test_size=0.2, random_state=0)

X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]

# X_train = [text_prepare(x) for x in X_train]
# X_test = [text_prepare(x) for x in X_test]

# scikit-learn documents ``stop_words`` as 'english', a list or None, and
# recent releases reject other types (such as a set) during parameter
# validation, so the STOPWORDS set is passed as a list.
_stop_word_list = sorted(STOPWORDS)

# The token pattern is a raw string so that "\S" is not treated as an
# (invalid) string escape sequence.
cv = CountVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                     token_pattern=r'(\S+)', stop_words=_stop_word_list)
feature = cv.fit_transform(X_train)

# NOTE: this rebinds ``feature``; the count matrix computed above is discarded.
tfidf = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                        token_pattern=r'(\S+)', stop_words=_stop_word_list)
feature = tfidf.fit_transform(X_train)


# # Naive Bayes classifier
# clf = MultinomialNB().fit(tfidf.transform(X_train), y_train)
# y_nb_pred = clf.predict(tfidf.transform(X_test))
# print('nb_confusion_matrix:')
# print(confusion_matrix(y_test, y_nb_pred))
# print('nb_classification_report:')
# print(classification_report(y_test, y_nb_pred))

 
# # SGD
# clf = SGDClassifier(alpha=0.0001, max_iter=1000).fit(tfidf.transform(X_train), y_train)
# y_SGD_pred = clf.predict(tfidf.transform(X_test))
# print('SGD_confusion_matrix:')
# print(confusion_matrix(y_test, y_SGD_pred))
# print('SGD_classification_report:')
# print(classification_report(y_test, y_SGD_pred))
 
# svm
# Fit a linear SVM on the TF-IDF features of the training documents.
clf = LinearSVC()
clf.fit(tfidf.transform(X_train), y_train)

import joblib

# Persist the classifier and the fitted vectorizer, then reload both so the
# evaluation below runs against the saved artifacts.
joblib.dump(clf, 'SVC.joblib')
joblib.dump(tfidf, 'TFIDF.joblib')
model = joblib.load('SVC.joblib')
tfidf_model = joblib.load('TFIDF.joblib')

# Evaluate on the held-out split.
y_svm_pred = model.predict(tfidf_model.transform(X_test))
print('svm_confusion_matrix:', confusion_matrix(y_test, y_svm_pred), sep='\n')
print('svm_classification_report:', classification_report(y_test, y_svm_pred), sep='\n')


# # RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=10)
# clf.fit(tfidf.transform(X_train), y_train)
# y_RF_pred = clf.predict(tfidf.transform(X_test))
# print('RF_confusion_matrix:')
# print(confusion_matrix(y_test, y_RF_pred))
# print('RF_classification_report:')
# print(classification_report(y_test, y_RF_pred))