# Load the IPython autoreload extension; mode 2 reloads all modules before
# each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one.
InteractiveShell.ast_node_interactivity = "all"
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the object has been read, even when unpickling raises.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only use it on files that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pickled artefacts read from ./data and used by the cells below.
data_vectorized = _load_pickle("./data/amdt_data_vectorized.pickle")
vectorizer = _load_pickle("./data/amdt_vectorizer.pickle")
lda_Z = _load_pickle("./data/amdt_lda_Z.pickle")
lda_model = _load_pickle("./data/amdt_lda_model.pickle")
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
# Number of topics, read from the second dimension of lda_Z.
NUM_TOPICS = lda_Z.shape[1]
 
# Earlier vectorizer configurations, kept commented out (the vectorizer
# actually used is the one loaded from the pickle file above).
# NOTE(review): if this configuration is ever restored, scikit-learn only
# accepts 'english' as a string value for stop_words, and the ASCII-only
# token_pattern would not match accented French letters -- confirm before reuse.
# vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
#                              stop_words='french', lowercase=True, 
#                              token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
#vectorizer = CountVectorizer()
# Display the shape of lda_Z.
lda_Z.shape
(100, 15)
# Sample text used to try the loaded vectorizer and model on a single document.
text = "Pour les entreprises de moins de 11 salariés, l’employeur est autorisé à attribuer une fois par an, à l’ensemble des salariés qu’il emploie, la prime exceptionnelle de pouvoir d’achat, dans les conditions prévues au V. »II. – En conséquence, compléter cet article par les deux alinéas suivants"
# Vectorize the one-document batch, then keep the model output for that
# single document.
_text_counts = vectorizer.transform([text])
x = lda_model.transform(_text_counts)[0]
# Print the resulting vector together with the sum of its entries.
print(x, x.sum())
[0.00185185 0.00185185 0.00185186 0.00185185 0.00185185 0.1337625
 0.41106784 0.11301411 0.00185185 0.00185185 0.00185186 0.00185186
 0.00185185 0.00185186 0.32178515] 0.9999999999999998
# Number of features known to the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when the installed
# version provides it and fall back to the old one otherwise.
_feature_names_getter = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
len(_feature_names_getter())
3223
import matplotlib.pyplot as plt

# code from https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def plot_top_words(model, feature_names, n_top_words, title, n_cols=5):
    """Draw one horizontal bar chart per topic showing its top-weighted words.

    Parameters
    ----------
    model
        Object exposing ``components_``; each row is read as the per-feature
        weights of one topic.
    feature_names
        Sequence indexed by feature position, giving the label of each feature.
    n_top_words : int
        Number of highest-weighted features to show for each topic.
    title : str
        Title displayed above the whole figure.
    n_cols : int, default 5
        Number of charts per row. The number of rows is derived from the
        number of topics, so any topic count fits on the grid.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be at least 1")

    n_topics = len(model.components_)
    # Ceiling division; keep at least one row so the figure can be created.
    n_rows = max(1, -(-n_topics // n_cols))

    # Same per-chart size as the original fixed 3x5 layout (30 x 25 inches).
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(6 * n_cols, 25 * n_rows / 3),
        sharex=True,
        squeeze=False,
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, in descending order.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        # Put the heaviest word at the top of each chart.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # Hide grid cells that have no topic to display.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()
n_top_words = 20

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new method when the installed
# version provides it and fall back to the old one otherwise.
_feature_names_getter = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
# The topic count in the title comes from the model instead of being
# hard-coded, so the title stays correct if the model is retrained.
plot_top_words(
    lda_model,
    _feature_names_getter(),
    n_top_words,
    f'Liste des {len(lda_model.components_)} thématiques',
)

WIP : Interactive plot

TODO : Malheureusement, ça ne fonctionne pas encore sur tous les postes :(

 
# pyLDAvis.enable_notebook()
# #pyLDAvis.enable_notebook(local=True)
# panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# panel
/home/ben/.virtualenvs/lex/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:691: FutureWarning: 'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning.
  warnings.warn(
 
/home/ben/.virtualenvs/lex/lib/python3.8/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
  and should_run_async(code)