%reload_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
!python -m spacy download fr_core_news_sm > /dev/null
import pandas as pd
#amdt = pd.read_csv('../../../donnees-extraites-assemblee/textes_amendements_nouveaux_articles_plfss_2020-2021.csv')
amdt = pd.read_csv('https://github.com/leximpact/donnees-extraites-assemblee/raw/main/textes_amendements_nouveaux_articles_plfss_2020-2021.csv')
amdt.head(1)
# Keep a random sample
amdt = amdt.sample(n=100, replace=False, random_state=1)
len(amdt)
# Join the operative text and the explanatory statement with a separator,
# so the last word of one does not fuse with the first word of the other
amdt['texte'] = amdt['dispositif'] + " " + amdt['exposeSommaire']
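# If either column can contain missing values (an assumption; the source CSV
# may well be fully populated), string concatenation yields NaN for the whole
# row. A minimal guard, if that turns out to be the case:
amdt['texte'] = amdt['dispositif'].fillna("") + " " + amdt['exposeSommaire'].fillna("")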
test = "Bonjour Madame Marianne et Monsieur Coq,\n Comment allez-vous ?"
print(test)
import spacy
#from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
nlp = spacy.load("fr_core_news_sm")
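# The commented import above points at spaCy's built-in French stop-word
# list, which is what token.is_stop checks against. A quick side look at it
# (inspection only, not part of the pipeline):
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
print(len(fr_stop))          # size of the built-in French stop-word list
print(sorted(fr_stop)[:10])  # first few entries, alphabetically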
def return_token(sentence):
    # Tokenize the sentence
    doc = nlp(sentence)
    # Return the text of each token, dropping punctuation, whitespace
    # (including line breaks), and stop words
    return [X.text for X in doc if not (X.is_punct or X.is_space or X.is_stop)]
test_token = return_token(test)
test_token
data_words = []
def str_token(text):
    # Accumulate every token into the module-level data_words list,
    # then return the cleaned text as a single space-joined string
    global data_words
    tokens = return_token(text)
    data_words += tokens
    return " ".join(tokens)
amdt['txt_sans_stopword'] = amdt['texte'].apply(str_token)
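# Calling nlp() row by row is slow on larger samples. spaCy's nlp.pipe
# streams texts through the pipeline in batches, which is usually much
# faster. A minimal sketch of the same cleaning step with it (equivalent
# output, but note it does not fill data_words):
cleaned = [
    " ".join(t.text for t in doc if not (t.is_punct or t.is_space or t.is_stop))
    for doc in nlp.pipe(amdt['texte'], batch_size=50)
]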
len(data_words)
import pickle
import os
os.makedirs("./data", exist_ok=True)  # make sure the output directory exists
with open("./data/amdt_data_words.pickle", "wb") as f:
    pickle.dump(data_words, f)
amdt['txt_sans_stopword'].to_csv('./data/amdt_sans_stopword.csv.gz', compression='gzip')
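# A quick round-trip check (an optional sanity test, not in the original
# pipeline): reload both artifacts and confirm their sizes.
with open("./data/amdt_data_words.pickle", "rb") as f:
    reloaded_words = pickle.load(f)
assert len(reloaded_words) == len(data_words)
reloaded_df = pd.read_csv('./data/amdt_sans_stopword.csv.gz', compression='gzip')
len(reloaded_df)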
# Named-entity recognition on the test sentence
doc = nlp(test)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
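# The same NER pass can be run over the amendment texts themselves. A minimal
# sketch that counts entity labels across the sample (label distribution
# only; it assumes amdt['texte'] has no missing values, see the fillna guard
# above):
from collections import Counter
label_counts = Counter(
    ent.label_
    for doc in nlp.pipe(amdt['texte'])
    for ent in doc.ents
)
label_counts.most_common()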