%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
amdt = pd.read_csv('./data/amdt_sans_stopword.csv.gz')
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 15
 
# A more selective vectorizer, kept for reference. Note that scikit-learn only ships
# an 'english' stop-word list, so a French list would have to be passed explicitly;
# stop words were already stripped upstream in the txt_sans_stopword column.
# vectorizer = CountVectorizer(min_df=5, max_df=0.9,
#                              lowercase=True,
#                              token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(amdt["txt_sans_stopword"])
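A quick shape check (a sketch, not part of the original run) shows how many documents and vocabulary terms the vectorizer produced; get_feature_names_out() assumes scikit-learn >= 1.0.

# Sanity check (sketch): documents x vocabulary size of the document-term matrix
print(data_vectorized.shape)
# Peek at a few vocabulary terms (get_feature_names_out assumes scikit-learn >= 1.0)
print(vectorizer.get_feature_names_out()[:10])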
 
# Build a Latent Dirichlet Allocation model
# n_jobs=-1 uses all CPU cores
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online', n_jobs=-1)
lda_Z = lda_model.fit_transform(data_vectorized)
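To get a feel for what the 15 topics look like, a small helper (a sketch, not part of the original notebook) can print the highest-weighted words of each topic from lda_model.components_.

# Sketch: print the top-weighted words of each topic
# (get_feature_names_out assumes scikit-learn >= 1.0)
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    print("Topic %d: %s" % (topic_idx, " ".join(top_words)))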
from sklearn.model_selection import GridSearchCV

# Define the search parameters (a single combination here; widen the lists for a real sweep)
search_params = {'n_components': [3],            # e.g. [10, 15, 20, 25, 30]
                 'learning_decay': [.5],         # [.5, .7, .9]
                 'max_iter': [10],
                 'learning_method': ['online'],
                 'n_jobs': [-1]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5], 'learning_method': ['online'],
                         'max_iter': [10], 'n_components': [3],
                         'n_jobs': [-1]})
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
Best Model's Params:  {'learning_decay': 0.5, 'learning_method': 'online', 'max_iter': 10, 'n_components': 3, 'n_jobs': -1}
Best Log Likelihood Score:  -21229.415783180353
Model Perplexity:  2248.6663320542257
model.cv_results_
{'mean_fit_time': array([0.38533525]),
 'std_fit_time': array([0.01228836]),
 'mean_score_time': array([0.02309036]),
 'std_score_time': array([0.00309237]),
 'param_learning_decay': masked_array(data=[0.5],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_learning_method': masked_array(data=['online'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_components': masked_array(data=[3],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_jobs': masked_array(data=[-1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_decay': 0.5,
   'learning_method': 'online',
   'max_iter': 10,
   'n_components': 3,
   'n_jobs': -1}],
 'split0_test_score': array([-18425.07818445]),
 'split1_test_score': array([-21359.64711255]),
 'split2_test_score': array([-20499.78328675]),
 'split3_test_score': array([-20304.64253598]),
 'split4_test_score': array([-25557.92779617]),
 'mean_test_score': array([-21229.41578318]),
 'std_test_score': array([2366.63706352]),
 'rank_test_score': array([1], dtype=int32)}
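cv_results_ is easier to scan as a DataFrame; a one-liner sketch (not part of the original output):

# Sketch: tabulate the grid-search results
pd.DataFrame(model.cv_results_)[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]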
 
# The sweep below only makes sense once the wider grid above is searched
# (several n_components and learning_decay values). model.grid_scores_ was
# removed from scikit-learn, so cv_results_ is used instead.

# import matplotlib.pyplot as plt
# n_topics = [10, 15, 20, 25, 30]
# results = pd.DataFrame(model.cv_results_)
# log_likelihoods_5 = [round(s) for s in results[results.param_learning_decay == 0.5].mean_test_score]
# log_likelihoods_7 = [round(s) for s in results[results.param_learning_decay == 0.7].mean_test_score]
# log_likelihoods_9 = [round(s) for s in results[results.param_learning_decay == 0.9].mean_test_score]

# # Show graph
# plt.figure(figsize=(12, 8))
# plt.plot(n_topics, log_likelihoods_5, label='0.5')
# plt.plot(n_topics, log_likelihoods_7, label='0.7')
# plt.plot(n_topics, log_likelihoods_9, label='0.9')
# plt.title("Choosing the Optimal LDA Model")
# plt.xlabel("Num Topics")
# plt.ylabel("Log Likelihood Scores")
# plt.legend(title='Learning decay', loc='best')
# plt.show()

How to see the dominant topic in each document?

To classify a document as belonging to a particular topic, a logical approach is to see which topic contributes most to that document and assign that topic to it.

In the table below, every topic with a major contribution to a document is highlighted in green, and the dominant topic is reported in its own column.

best_lda_model
LatentDirichletAllocation(learning_decay=0.5, learning_method='online',
                          n_components=3, n_jobs=-1)
import numpy as np
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
data = amdt["txt_sans_stopword"]
# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics
Topic0 Topic1 Topic2 dominant_topic
Doc0 0.040000 0.040000 0.910000 2
Doc1 0.990000 0.000000 0.000000 0
Doc2 0.010000 0.010000 0.980000 2
Doc3 0.010000 0.990000 0.010000 1
Doc4 0.000000 0.000000 1.000000 2
Doc5 0.010000 0.970000 0.020000 1
Doc6 0.000000 0.000000 0.990000 2
Doc7 0.010000 0.010000 0.980000 2
Doc8 0.010000 0.010000 0.980000 2
Doc9 0.000000 0.990000 0.000000 1
Doc10 0.010000 0.520000 0.480000 1
Doc11 0.000000 0.990000 0.000000 1
Doc12 0.000000 1.000000 0.000000 1
Doc13 0.000000 0.990000 0.000000 1
Doc14 0.990000 0.000000 0.000000 0

Review the topic distribution across documents

df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution
Topic Num Num Documents
0 1 47
1 2 28
2 0 25
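A quick bar chart (an illustrative sketch, assuming matplotlib is available) makes the topic imbalance easier to see:

import matplotlib.pyplot as plt

# Sketch: number of documents per dominant topic
df_topic_distribution.plot.bar(x='Topic Num', y='Num Documents', legend=False)
plt.ylabel("Num Documents")
plt.show()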

Assign a topic to a text using the model

text = "Pour les entreprises de moins de 11 salariés, l’employeur est autorisé à attribuer une fois par an, à l’ensemble des salariés qu’il emploie, la prime exceptionnelle de pouvoir d’achat, dans les conditions prévues au V. »II. – En conséquence, compléter cet article par les deux alinéas suivants"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())
[0.00185185 0.00185185 0.00185186 0.00185185 0.00185185 0.1337625
 0.41106784 0.11301411 0.00185185 0.00185185 0.00185186 0.00185186
 0.00185185 0.00185186 0.32178515] 0.9999999999999998
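The dominant topic for this new text is simply the argmax of that distribution (an illustrative follow-up, not part of the original run):

# Sketch: index of the most probable topic for the new text
print("Dominant topic:", x.argmax())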
lda_Z.shape
(100, 15)
lda_Z[:1]
array([[0.00740741, 0.00740741, 0.00740741, 0.00740741, 0.00740741,
        0.00740743, 0.00740741, 0.89629623, 0.00740741, 0.00740741,
        0.00740741, 0.00740741, 0.00740741, 0.00740741, 0.00740742]])
 
import pickle
pickle.dump(vectorizer, open("./data/amdt_vectorizer.pickle", "wb"))
pickle.dump(data_vectorized, open("./data/amdt_data_vectorized.pickle", "wb"))
pickle.dump(lda_model, open("./data/amdt_lda_model.pickle", "wb"))
pickle.dump(lda_Z, open("./data/amdt_lda_Z.pickle", "wb"))
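For later reuse, the persisted artefacts can be reloaded with pickle.load; a minimal sketch (assuming the same file paths):

# Sketch: reload the persisted artefacts and score a new document
with open("./data/amdt_vectorizer.pickle", "rb") as f:
    vectorizer_loaded = pickle.load(f)
with open("./data/amdt_lda_model.pickle", "rb") as f:
    lda_model_loaded = pickle.load(f)

print(lda_model_loaded.transform(vectorizer_loaded.transform([text]))[0].argmax())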