%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
amdt = pd.read_csv('./data/amdt_sans_stopword.csv.gz')
Topic analysis with LDA
LDA: https://fr.wikipedia.org/wiki/Allocation_de_Dirichlet_latente
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
NUM_TOPICS = 15
# vectorizer = CountVectorizer(min_df=5, max_df=0.9,
#                              stop_words='french',  # NB: scikit-learn has no built-in French stop-word list, this would raise an error
#                              lowercase=True,
#                              token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(amdt["txt_sans_stopword"])
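Note that scikit-learn's CountVectorizer only ships an English stop-word list, so the commented-out stop_words='french' above is not supported. Since the txt_sans_stopword column has already had stop words removed upstream, the default CountVectorizer() is enough here; if you did want to filter at vectorization time you would pass an explicit word list. A minimal sketch (the short list below is illustrative only, not a complete French stop-word list):
# Illustrative only: CountVectorizer accepts any list of stop words
french_stopwords = ["le", "la", "les", "de", "des", "un", "une", "et", "à", "en"]
vectorizer_fr = CountVectorizer(min_df=5, max_df=0.9, lowercase=True,
                                stop_words=french_stopwords)
# vectorizer_fr.fit_transform(amdt["txt_sans_stopword"]) would then drop these tokens as well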
# Build a Latent Dirichlet Allocation Model
# n_jobs = -1 to use all CPU cores
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online', n_jobs = -1)
lda_Z = lda_model.fit_transform(data_vectorized)
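To sanity-check the topics, it helps to print the most probable words of each one. A minimal sketch, assuming the lda_model and vectorizer fitted above (on scikit-learn versions before 1.0, use get_feature_names() instead of get_feature_names_out()):
import numpy as np
# components_ holds one row of word weights per topic; take the 10 largest per row
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic_weights in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in np.argsort(topic_weights)[::-1][:10]]
    print("Topic {}: {}".format(topic_idx, ", ".join(top_words)))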
from sklearn.model_selection import GridSearchCV
# Define Search Param
search_params = {'n_components': [i for i in range(3, 4)],
                 'learning_decay': [.5],  # [.5, .7, .9]
                 'max_iter': [10],
                 'learning_method': ['online'],
                 'n_jobs': [-1]}
# Init the Model
lda = LatentDirichletAllocation()
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
model.cv_results_
# import matplotlib.pyplot as plt
# # Plot mean test scores (log likelihood) per learning_decay, from cv_results_
# # (grid_scores_ was removed from scikit-learn; requires running the full grid above)
# results = pd.DataFrame(model.cv_results_)
# n_topics = [10, 15, 20, 25, 30]
# log_likelihoods_5 = results[results.param_learning_decay == 0.5]['mean_test_score']
# log_likelihoods_7 = results[results.param_learning_decay == 0.7]['mean_test_score']
# log_likelihoods_9 = results[results.param_learning_decay == 0.9]['mean_test_score']
# # Show graph
# plt.figure(figsize=(12, 8))
# plt.plot(n_topics, log_likelihoods_5, label='0.5')
# plt.plot(n_topics, log_likelihoods_7, label='0.7')
# plt.plot(n_topics, log_likelihoods_9, label='0.9')
# plt.title("Choosing Optimal LDA Model")
# plt.xlabel("Num Topics")
# plt.ylabel("Log Likelihood Scores")
# plt.legend(title='Learning decay', loc='best')
# plt.show()
How do we see the dominant topic in each document?
To classify a document as belonging to a particular topic, a logical approach is to find the topic with the highest contribution to that document and assign the document to it.
In the table below, every topic with a major contribution to a document is highlighted in green, and the most dominant topic is shown in its own column.
best_lda_model
import numpy as np
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
data = amdt["txt_sans_stopword"]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution
text = "Pour les entreprises de moins de 11 salariés, l’employeur est autorisé à attribuer une fois par an, à l’ensemble des salariés qu’il emploie, la prime exceptionnelle de pouvoir d’achat, dans les conditions prévues au V. »II. – En conséquence, compléter cet article par les deux alinéas suivants"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())
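The vector x is the topic distribution inferred for this new amendment text (it sums to 1); its argmax is the predicted dominant topic. A small follow-up sketch:
# Dominant topic and its weight for the new text
predicted_topic = int(x.argmax())
print("Dominant topic: {} (weight {:.2f})".format(predicted_topic, x[predicted_topic]))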
lda_Z.shape
lda_Z[:1]
import pickle
with open("./data/amdt_vectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer, f)
with open("./data/amdt_data_vectorized.pickle", "wb") as f:
    pickle.dump(data_vectorized, f)
with open("./data/amdt_lda_model.pickle", "wb") as f:
    pickle.dump(lda_model, f)
with open("./data/amdt_lda_Z.pickle", "wb") as f:
    pickle.dump(lda_Z, f)
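To reuse these artefacts later (for example to score new amendments without refitting), they can be loaded back with pickle.load; a minimal sketch assuming the same paths:
# Reload the persisted objects in a later session
with open("./data/amdt_vectorizer.pickle", "rb") as f:
    vectorizer = pickle.load(f)
with open("./data/amdt_lda_model.pickle", "rb") as f:
    lda_model = pickle.load(f)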