unaccent <- function(text) {
  # Strip accents by transliterating `text` to plain ASCII.
  #
  # text: character vector to convert.
  # Returns a character vector of the same length as `text`.
  #
  # The characters ' ` ^ ~ " already present in the input are first turned
  # into spaces; after the conversion, any occurrence of those same characters
  # is deleted. Presumably this removes marks that some iconv implementations
  # emit in place of an accent -- TODO confirm on the target platform.
  marks <- "['`^~\"]"
  spaced <- gsub(marks, " ", text)
  ascii <- iconv(spaced, to = "ASCII//TRANSLIT//IGNORE")
  gsub(marks, "", ascii)
}
remove_space <- function(some_txt) {
  # Collapse redundant spaces: leading/trailing spaces are dropped and every
  # run of spaces between words becomes a single space.
  #
  # some_txt: character vector (normally a single string). When several
  #   strings are supplied, their words are joined into ONE string.
  # Returns a single string, or NA_character_ when the input holds only
  #   missing values.
  words <- unlist(strsplit(some_txt, " "))

  # Without this guard paste() would turn a missing value into the literal
  # word "NA", which later steps could no longer tell apart from real text.
  if (length(words) > 0L && all(is.na(words))) {
    return(NA_character_)
  }

  # Keep real words only: empty pieces come from consecutive spaces, and NA
  # pieces from missing elements mixed in with actual text.
  words <- words[!is.na(words) & nzchar(words)]
  paste(words, collapse = " ")
}
remove_apostrophe <- function(some_txt) {
  # Replace apostrophes with spaces. Both the ASCII apostrophe (') and the
  # typographic one (U+2019) are handled; a run of apostrophes yields a single
  # space and apostrophes at the start or end of the text are simply dropped.
  #
  # some_txt: character vector (normally a single string). When several
  #   strings are supplied, their pieces are joined into ONE string.
  # Returns a single string, or NA_character_ when the input holds only
  #   missing values.

  # Without this guard paste() would turn a missing value into the literal
  # word "NA".
  if (length(some_txt) > 0L && all(is.na(some_txt))) {
    return(NA_character_)
  }

  # The typographic apostrophe is written as an escape so that the pattern
  # does not depend on the encoding this file is read with.
  for (mark in c("'", "\u2019")) {
    pieces <- unlist(strsplit(some_txt, mark))
    # Empty pieces come from leading or consecutive apostrophes.
    pieces <- pieces[!is.na(pieces) & nzchar(pieces)]
    some_txt <- paste(pieces, collapse = " ")
  }
  some_txt
}
try.error <- function(x) {
  # Lower-case `x`, falling back to NA when tolower() raises an error.
  #
  # x: value to lower-case (normally a character vector).
  # Returns tolower(x) on success, otherwise a single logical NA.
  tryCatch(
    tolower(x),
    error = function(cond) NA
  )
}
nettoyage <- function(some_txt) {
  # Clean a vector of raw messages: strips retweet markers, @accounts,
  # #hashtags, apostrophes, punctuation, digits, links and accents, squeezes
  # spaces and lower-cases the text.
  #
  # some_txt: character vector of texts to clean.
  # Returns an unnamed character vector. Elements that end up missing (for
  #   instance because lower-casing failed) are dropped, so the result may be
  #   shorter than the input. The result is always of type character, even
  #   for empty input or when every element is dropped.

  # Apply a scalar cleaner to each element. vapply() pins the result to a
  # character vector, so empty input gives character(0) rather than NULL.
  per_element <- function(values, cleaner) {
    vapply(
      values,
      function(el) as.character(cleaner(el)),
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Remove "RT" / "via" markers together with the accounts that follow them
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  # Remove remaining @accounts
  some_txt <- gsub("@\\w+", "", some_txt)
  # Remove #hashtags
  some_txt <- gsub("#\\w+", "", some_txt)
  # Turn apostrophes into spaces (must happen before punctuation is deleted,
  # otherwise the two words around an apostrophe would be glued together)
  some_txt <- per_element(some_txt, remove_apostrophe)
  # Remove punctuation
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  # Remove digits
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # Remove links. This runs after punctuation removal: by then a URL such as
  # "http://t.co/x" has been fused into one token ("httptcox") that the
  # pattern below matches in full.
  some_txt <- gsub("http\\w+", "", some_txt)
  # Remove accents
  some_txt <- per_element(some_txt, unaccent)
  # Squeeze redundant spaces
  some_txt <- per_element(some_txt, remove_space)
  # Lower-case; try.error yields NA for elements that cannot be converted
  some_txt <- per_element(some_txt, try.error)
  # Drop the missing values
  some_txt <- some_txt[!is.na(some_txt)]
  # Drop any names carried over from the input
  names(some_txt) <- NULL
  some_txt
}