library(readr)
library(dplyr) # mutate(), row_number()
# Load the raw financial-news data. Column names are supplied explicitly
# (`sentiment`, `text`), and the label column is converted to a factor so it
# can be used as a classification outcome.
df <- read_csv(
  '../../../raw_data/financial_news.csv',
  col_names = c('sentiment', 'text')
) |>
  mutate(sentiment = factor(sentiment))
Some very basic ML using Naive Bayes and the tidymodels framework.
Using the tidyverse, we’ll
- split the df into a training set and a testing set.
library(rsample) # initial_split(), training(), testing()
library(recipes)
library(parsnip) # naive_bayes(), set_engine()
library(workflows) # workflow()
library(discrim)
library(textrecipes)
library(yardstick)
# 80/20 train/test split, stratified on the outcome column `sentiment`.
list_splits <- initial_split(df, prop = 0.8, strata = 'sentiment')
df_train <- training(list_splits)
df_test <- testing(list_splits)
# Text preprocessing recipe: tokenize `text`, drop stop words, cap the
# vocabulary at 100 tokens, then convert the remaining tokens to tf-idf
# features. `sentiment` is the outcome; every other column is a predictor.
list_recipe <- recipe(sentiment ~ ., data = df_train) |>
  step_tokenize(text) |>
  step_stopwords(text) |>
  step_tokenfilter(text, max_tokens = 100) |>
  step_tfidf(text)
# Model specifications (both in classification mode).
mod_nb <- naive_Bayes() |>
  set_engine('naivebayes') |>
  set_mode('classification')
mod_svm <- svm_poly() |>
  set_engine('kernlab') |>
  set_mode('classification')

# 5-fold cross-validation folds on the training set, stratified on `sentiment`.
list_cv <- vfold_cv(df_train, v = 5, strata = 'sentiment')
# Bundle the preprocessing recipe with the Naive Bayes specification,
# then print the workflow.
wf_nb <- workflow() |>
  add_recipe(list_recipe) |>
  add_model(mod_nb)
wf_nb
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: naive_Bayes()
── Preprocessor ────────────────────────────────────────────────────────────────
4 Recipe Steps
• step_tokenize()
• step_stopwords()
• step_tokenfilter()
• step_tfidf()
── Model ───────────────────────────────────────────────────────────────────────
Naive Bayes Model Specification (classification)
Computational engine: naivebayes
# Bundle the same preprocessing recipe with the polynomial SVM specification.
wf_svm <- workflow() |>
  add_recipe(list_recipe) |>
  add_model(mod_svm)
# Fit the Naive Bayes workflow on the training set, then predict on the test
# set: hard class labels first, then per-class probabilities.
fit_mod_nb <- fit(wf_nb, df_train)
pred_mod_nb <- predict(fit_mod_nb, df_test)
pred_mod_nb_prob <- predict(fit_mod_nb, df_test, type = 'prob')
# Fit the SVM workflow on the training set.
fit_mod_svm <- fit(wf_svm, df_train)
Setting default kernel parameters
# Predict on the test set with the fitted SVM: hard class labels first, then
# per-class probabilities.
pred_mod_svm <- predict(fit_mod_svm, df_test)
pred_mod_svm_prob <- predict(fit_mod_svm, df_test, type = 'prob')
# Confusion matrix for the Naive Bayes class predictions on the test set.
df_test |>
  bind_cols(pred_mod_nb) |>
  conf_mat(truth = sentiment, estimate = .pred_class)
Truth
Prediction negative neutral positive
negative 36 22 44
neutral 74 531 188
positive 11 23 41
# Confusion matrix for the SVM class predictions on the test set.
df_test |>
  bind_cols(pred_mod_svm) |>
  conf_mat(truth = sentiment, estimate = .pred_class)
Truth
Prediction negative neutral positive
negative 22 12 22
neutral 81 540 183
positive 18 24 68
#roc_nb <- bind_cols(df_test, pred_mod_nb_prob) |> roc_curve()