# Notebook setup: load the autoreload extension and render matplotlib figures inline.
%reload_ext autoreload
# Mode 2: autoreload re-imports modules before each cell is executed.
%autoreload 2
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pickle


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes, even when unpickling raises.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this project, never pickles from an untrusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Artifacts read from ./data (presumably written by an earlier training
# notebook -- TODO confirm).
data_vectorized = _load_pickle("./data/amdt_data_vectorized.pickle")
vectorizer = _load_pickle("./data/amdt_vectorizer.pickle")
lda_Z = _load_pickle("./data/amdt_lda_Z.pickle")
lda_model = _load_pickle("./data/amdt_lda_model.pickle")

Analyse des thématiques par LDA

LDA : https://fr.wikipedia.org/wiki/Allocation_de_Dirichlet_latente

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
# Topic count is taken from the second dimension of lda_Z
# (presumably the document-topic matrix -- TODO confirm).
NUM_TOPICS = lda_Z.shape[1]
 
# Disabled: the vectorizer is loaded from its pickle instead of being rebuilt here.
# vectorizer = CountVectorizer(min_df=5, max_df=0.9,
#                              stop_words='french', lowercase=True,
#                              token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
#vectorizer = CountVectorizer()

# Show the dimensions of lda_Z.
lda_Z.shape

(100, 15)

text = "Pour les entreprises de moins de 11 salariés, l’employeur est autorisé à attribuer une fois par an, à l’ensemble des salariés qu’il emploie, la prime exceptionnelle de pouvoir d’achat, dans les conditions prévues au V. »II. – En conséquence, compléter cet article par les deux alinéas suivants"
# Vectorize the sample text as a one-document batch, run it through the LDA
# model, and keep the row for that single document.
sample_counts = vectorizer.transform([text])
sample_topics = lda_model.transform(sample_counts)
x = sample_topics[0]
# Print the resulting vector together with the sum of its entries.
print(x, x.sum())

[0.00185185 0.00185185 0.00185186 0.00185185 0.00185185 0.1337625
 0.41106784 0.11301411 0.00185185 0.00185185 0.00185186 0.00185186
 0.00185185 0.00185186 0.32178515] 0.9999999999999998

# Size of the vectorizer vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
len(feature_names)

3223

import matplotlib.pyplot as plt

# code from https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its top-weighted words.

    Args:
        model: Fitted estimator exposing ``components_``; each row is read as
            the per-feature weights of one topic.
        feature_names: Sequence mapping a feature index to its label.
        n_top_words: Number of highest-weight features plotted per topic.
        title: Title displayed above the whole figure.

    The grid has 5 columns and at least 3 rows (the original 3x5 layout);
    extra rows are added when the model has more than 15 topics, so every
    topic gets its own axes instead of raising ``IndexError``.
    """
    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division without importing math; never fewer than 3 rows so the
    # layout for up to 15 topics is unchanged.
    n_rows = max(3, -(-n_topics // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(30, 25 * n_rows / 3), sharex=True
    )
    axes = axes.flatten()
    # The title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        # Put the heaviest word at the top of each chart.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

n_top_words = 20

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Take the topic count shown in the title from the model rather than hard-coding 15.
plot_top_words(lda_model, feature_names, n_top_words,
               f'Liste des {len(lda_model.components_)} thématiques')

WIP : Interactive plot

TODO : Malheureusement, ça ne fonctionne pas encore sur tous les postes :(

 
# pyLDAvis.enable_notebook()
# #pyLDAvis.enable_notebook(local=True)
# panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# panel

/home/ben/.virtualenvs/lex/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:691: FutureWarning: 'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning.
  warnings.warn(

/home/ben/.virtualenvs/lex/lib/python3.8/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)

Etude des sujets des amendements PLFSS - Graph

Analyse des thématiques par LDA

WIP : Interactive plot