# Notebook setup. The lines starting with "%" are IPython magics, so this file
# only runs inside IPython/Jupyter, not as a plain Python script.
# (Re)load the autoreload extension and select mode 2 (per the IPython
# documentation: re-import all modules before executing code).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import pickle
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the object has been read, instead of being left open until garbage
    collection.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load files that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pre-computed artifacts read from ./data.
# NOTE(review): presumably the document-term matrix, the fitted vectorizer,
# the document-topic matrix and the fitted LDA model -- confirm against the
# notebook that wrote these files.
data_vectorized = _load_pickle("./data/amdt_data_vectorized.pickle")
vectorizer = _load_pickle("./data/amdt_vectorizer.pickle")
lda_Z = _load_pickle("./data/amdt_lda_Z.pickle")
lda_model = _load_pickle("./data/amdt_lda_model.pickle")
Analyse des thématiques par LDA
LDA : https://fr.wikipedia.org/wiki/Allocation_de_Dirichlet_latente
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Number of topics, taken from the number of columns of the pickled lda_Z array.
NUM_TOPICS = lda_Z.shape[1]
# Kept for reference only (disabled). NOTE(review): presumably the settings
# tried when building the pickled vectorizer -- confirm before relying on them.
# vectorizer = CountVectorizer(min_df=5, max_df=0.9,
# stop_words='french', lowercase=True,
# token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
#vectorizer = CountVectorizer()
lda_Z.shape
# Sample text used to try the loaded vectorizer and model on a single document.
text = "Pour les entreprises de moins de 11 salariés, l’employeur est autorisé à attribuer une fois par an, à l’ensemble des salariés qu’il emploie, la prime exceptionnelle de pouvoir d’achat, dans les conditions prévues au V. »II. – En conséquence, compléter cet article par les deux alinéas suivants"
# Vectorize the sample, run it through the model, and keep the row for this
# single document ([0]).
x = lda_model.transform(vectorizer.transform([text]))[0]
# The sum is printed as a sanity check (presumably expected to be close to 1
# for a topic distribution -- confirm).
print(x, x.sum())
# Vocabulary size. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); prefer the new method
# and fall back to the old one so older installations keep working.
_feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
len(_feature_names)
import matplotlib.pyplot as plt
# Adapted from https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted words.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            weight vector per topic. Each vector must support ``argsort`` and
            indexing by an index array.
        feature_names: Sequence mapping a column index of ``components_`` to
            the corresponding word.
        n_top_words: Number of words plotted for each topic.
        title: Title drawn above the whole figure.

    The grid has 5 columns and as many rows as the number of topics requires,
    so a 15-topic model still gets the 3 x 5 layout on a 30 x 25 inch figure.
    Models with more than 15 topics no longer fail with an IndexError, and
    panels left over in the last row are hidden.
    """
    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division; always keep at least one row so subplots() is valid.
    n_rows = max(1, (n_topics + n_cols - 1) // n_cols)
    # Keep the height-per-row of the 3-row layout (25 inches for 3 rows).
    # squeeze=False guarantees a 2-D array of axes whatever the grid size.
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(30, 25 * n_rows / 3),
        sharex=True,
        squeeze=False,
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]
        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        # Put the highest-weighted word at the top of the panel.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)
    # Hide the panels of the last row that have no topic to show.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)
    # The title does not depend on the topic, so set it once after the loop.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
# Number of highest-weighted words shown for each topic.
n_top_words = 20
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to
# the old one so older installations keep working.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Take the topic count from the loaded model so the title cannot disagree
# with the figure (it reads "15" for a 15-topic model, as before).
n_topics = len(lda_model.components_)
plot_top_words(lda_model, feature_names, n_top_words, f'Liste des {n_topics} thématiques')
# Interactive pyLDAvis view, currently disabled (pyLDAvis is not imported in
# the visible part of this file).
# pyLDAvis.enable_notebook()
# #pyLDAvis.enable_notebook(local=True)
# panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# panel