library(ggplot2)
library(dplyr)
library(tidyr)
library(magrittr)
# Data wrangling done in exp6_setup_nb.Rmd
# Read in data
ratings_data_incl <- read.csv(file = "exp6_ratings.csv")

# Remove excluded participants' data
ratings_data_incl %>%
  subset(!(prolific_id %in% excluded)) %>%
  droplevels -> ratings_data_excl

ratings_data_excl$env <- relevel(ratings_data_excl$env, "vt")
ratings_data_excl$env <- relevel(ratings_data_excl$env, "ee")
ratings_data_excl$env <- relevel(ratings_data_excl$env, "complement")
ratings_data_excl$length <- relevel(ratings_data_excl$length, "short")
ratings_data_excl$structure <- relevel(ratings_data_excl$structure, "non-island")

# Add z-scores (by participant) for use later, INCLUDING burn-in trials
ratings_data_excl %>%
  subset(type == "experimental" | type == "filler" | type == "burn-in" | type == "burn-in_filler") %>%
  droplevels %>%
  group_by(prolific_id) %>%
  mutate(z_rating_all = scale(rating)) -> ratings_data_all

# Add z-scores (by participant) for use later, EXCLUDING burn-in trials
ratings_data_all %>%
  subset(type == "experimental" | type == "filler") %>%
  droplevels %>%
  group_by(prolific_id) %>%
  mutate(z_rating = scale(rating)) -> ratings_data

# Get experiment data
ratings_data %>%
  subset(type == "experimental") %>%
  droplevels -> exp_data

# Get filler data
ratings_data %>%
  subset(type == "filler") %>%
  droplevels -> filler_data

1 Descriptive stats


# Get summary of experiment data
exp_data %>%
  group_by(env,
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary
print(summary)
# Reorder levels
exp_data$env <- relevel(exp_data$env, "ee")
exp_data$env <- relevel(exp_data$env, "vt")
exp_data$length <- relevel(exp_data$length, "short")
exp_data$structure <- relevel(exp_data$structure, "non-island")

# Get summary of experiment data
exp_data %>%
  group_by(env,
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary
print(summary)
# Make a descriptive plot
summary_forplot <- summary
summary_forplot[7,] <- summary_forplot[5,]
summary_forplot[8,] <- summary_forplot[6,]
summary_forplot$environment <- summary_forplot$env
summary_forplot$environment[5:6] <- c("vt", "vt")
summary_forplot$environment[7:8] <- c("ee", "ee")
summary_forplot$environment <- recode(summary_forplot$environment, vt = "VT", ee = "EE")

summary_forplot %>%
  ggplot(aes(x = length,
             y = mean_rating,
             ymin = 1,
             ymax = 6,
             color = structure,
             group = structure)) +
  facet_grid(.~environment) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) +
  geom_point(aes(col = structure),
             size = 2) +
  scale_y_continuous(breaks = seq(1:6)) +
  scale_color_discrete("Structure",
                       labels = c("Non-island",
                                  "Island")) +
  labs(x = "Length",
       y = "Mean rating") -> plot_summary
print(plot_summary)


# Make an alternative plot
summary_forplot %>%
  ggplot(aes(x = length,
             y = mean_rating,
             color = env,
             ymin = 1,
             ymax = 6)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) + 
  labs(title = "Main items") -> plot_summary_collapsed
print(plot_summary_collapsed)

1.1 Breakdown by verb

exp_data %>%
  subset(structure == "island") %>%
  group_by(length,
           structure,
           verb, env) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> vbv_summary
print(vbv_summary)

for (vb in 1:length(unique(vbv_summary$verb))) {
  which.verb <- unique(vbv_summary$verb)[vb]
  vbv_summary %>%
    subset(verb == which.verb) %>%
    ggplot(aes(x = length,
               y = mean_rating,
               color = structure,
               group = structure,
               ymin = 1,
               ymax = 6)) +
    scale_y_continuous(breaks = seq(1:6)) +
    geom_point(data = subset(summary,
                             structure == "non-island"),
               aes(x = length,
                   y = mean_rating)) +
    geom_errorbar(data = subset(summary,
                                structure == "non-island"),
                  aes(ymin = mean_rating - se_rating,
                      ymax = mean_rating + se_rating),
                  width = 0.15) +
    geom_point(aes(col = structure),
               size = 2) +
    geom_errorbar(aes(ymin = mean_rating - se_rating,
                      ymax = mean_rating + se_rating),
                  width = 0.15) +
    labs(title = paste0(as.character(which.verb), " (", vbv_summary$env[vbv_summary$verb == which.verb], ")"),
         x = "Length",
         y = "Mean rating") -> plot
  print(plot)
}

1.1.1 DD scores by verb

# Save baseline data
exp_data %>%
  subset(structure == "non-island") %>%
  droplevels %>%
  group_by(length) %>%
  summarize(mean_z_rating = mean(z_rating)) -> baseline_z_ratings

# Get DD scores for each verb
exp_data %>%
  subset(structure == "island") %>%
  droplevels %>%
  group_by(verb, env) %>%
  summarize(mean_z_rating = mean(z_rating),
            isl_sh = mean(z_rating[length == "short"]),
            isl_l = mean(z_rating[length == "long"]),
            nisl_sh = baseline_z_ratings$mean_z_rating[1],
            nisl_l = baseline_z_ratings$mean_z_rating[2],
            d1 = nisl_l - isl_l,
            d2 = nisl_sh - isl_sh,
            dd = d1 - d2) -> vbv_dd
print(vbv_dd)

# Plot verbs by DD score
vbv_dd %>%
  subset(select = c(verb, env, dd)) %>%
  ggplot(aes(x = reorder(verb, dd),
             y = dd,
             fill = env)) +
  geom_col() + 
  scale_fill_discrete(name = "Environment",
                      labels = c("VT", "EE")) +
  labs(x = "Verb",
       y = "DD score")

1.2 Burn-in items (vs. main items)

ratings_data_excl %>%
  subset(type == "burn-in") %>%
  group_by(env, 
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary_burn_in

print(summary_burn_in)
summary_burn_in %>%
  ggplot(aes(x = length,
             y = mean_rating,
             color = env,
             ymin = 1,
             ymax = 6)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) + 
  labs(title = "Burn-in items") -> plot_summary_burn_in
print(plot_summary_burn_in)

# Compare
print(plot_summary_collapsed)

# Considering burn-in and main items together
ratings_data_excl %>%
  subset(type == "experimental" | type == "burn-in") %>%
  droplevels %>%
  group_by(env,
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary_combined
print(summary_combined)
# Plot
summary_combined %>%
  ggplot(aes(x = length,
             y = mean_rating,
             color = env,
             ymin = 1,
             ymax = 6)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) + 
  labs(title = "Combined burn-in and main items") -> plot_summary_combined
print(plot_summary_combined)

1.3 Trial order effects

ratings_data_excl %>%
  subset(type == "experimental" | type == "burn-in") %>%
  droplevels %>%
  subset(length == "long") %>%
  droplevels -> ratings_data_excl_combined

# Order the data frame by prolific id and trial
ratings_data_excl_combined_ordered <- ratings_data_excl_combined[order(ratings_data_excl_combined$epoch_time,
                                                                       ratings_data_excl_combined$env,
                                                                       ratings_data_excl_combined$trial),]

# Add new col for equalized trial
# Get number of observations per participant
obs <- nrow(ratings_data_excl_combined_ordered[ratings_data_excl_combined_ordered$epoch_time ==  min(ratings_data_excl_combined_ordered$epoch_time),])
ratings_data_excl_combined_ordered$trial_eq <- 1:(obs/3)

# Plot the ratings and draw a line
ratings_data_excl_combined_ordered %>%
  ggplot(aes(x = trial_eq,
             y = z_rating,
             color = env,
             group = env)) +
  geom_jitter() + 
  geom_smooth(method = "lm") -> plot_trial_order
print(plot_trial_order)

2 CLMs

2.1 Simple effects analysis

library(ordinal)

Attaching package: ‘ordinal’

The following object is masked from ‘package:dplyr’:

    slice
# Make sure nominal variables are coded as factors
exp_data$rating %<>% as.factor
exp_data$env %<>% as.factor
exp_data$structure %<>% as.factor
exp_data$length %<>% as.factor

# Sum code structure and length factors
contrasts(exp_data$structure) <- c(-0.5, 0.5)
contrasts(exp_data$length) <- c(-0.5, 0.5)
# Contrasts for env factor should be dummy-coded correctly by default after reordering the factors

clm(data = exp_data,
    formula = rating ~ env * length) -> clm_analysis
summary(clm_analysis)
formula: rating ~ env * length
data:    exp_data

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
envee          -1.3305     0.1696  -7.844 4.37e-15 ***
envvt          -1.6752     0.1748  -9.586  < 2e-16 ***
length1         0.2813     0.2370   1.187    0.235    
envee:length1  -2.6726     0.3394  -7.874 3.42e-15 ***
envvt:length1  -2.5573     0.3438  -7.437 1.03e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Threshold coefficients:
    Estimate Std. Error z value
1|2  -3.8944     0.1971 -19.754
2|3  -2.6209     0.1634 -16.044
3|4  -1.5535     0.1411 -11.013
4|5  -0.5841     0.1270  -4.599
5|6   0.5289     0.1263   4.188

2.2 Mixed effects analysis

# Save the ratings data to run on hb.ucsc.edu
saveRDS(exp_data,
        file = "ratings_data_for_clmm.rds")

# Run the following clmm on the cluster:
# clmm(data = exp_data,
#      formula = rating ~ env * length +
#      (1 + env * length | prolific_id) +
#      (1 + env * length | item)) -> clmm_analysis
# saveRDS(clmm_analysis, "exp6_clmm.rds")
 
# Read in RDS from cluster
clmm_analysis <- readRDS(file = "exp6_clmm.rds")
summary(clmm_analysis)
Variance-covariance matrix of the parameters is not defined
Cumulative Link Mixed Model fitted with the Laplace approximation

formula: rating ~ env * length + (1 + env * length | prolific_id) + (1 +      env * length | item)
data:    ratings_data_excl_baselines

Random effects:
 Groups      Name             Variance Std.Dev. Corr                               
 prolific_id (Intercept)      7.2111   2.6854                                      
             envee            0.5457   0.7387   -0.892                             
             envvt            1.5785   1.2564   -0.523  0.840                      
             lengthlong       1.0150   1.0075   -0.856  0.739  0.293               
             envee:lengthlong 2.0579   1.4345   -0.232  0.445  0.658 -0.077        
             envvt:lengthlong 1.5228   1.2340   -0.373  0.262  0.116  0.211  0.702 
 item        (Intercept)      0.3197   0.5654                                      
             envee            0.7320   0.8555   -0.292                             
             envvt            0.2181   0.4670    0.322 -0.677                      
             lengthlong       3.2911   1.8141    0.224 -0.828  0.577               
             envee:lengthlong 0.9801   0.9900   -0.238  0.647 -0.465 -0.961        
             envvt:lengthlong 7.2584   2.6941   -0.418  0.927 -0.497 -0.911  0.808 
Number of groups:  prolific_id 30,  item 24 

Coefficients:
              Estimate Std. Error z value Pr(>|z|)
envee          -2.3324         NA      NA       NA
envvt          -2.8870         NA      NA       NA
length1         0.5469         NA      NA       NA
envee:length1  -4.7146         NA      NA       NA
envvt:length1  -4.3064         NA      NA       NA

Threshold coefficients:
    Estimate Std. Error z value
1|2  -6.5216         NA      NA
2|3  -4.5322         NA      NA
3|4  -2.8098         NA      NA
4|5  -1.1472         NA      NA
5|6   0.9454         NA      NA
---
title: "RC subextraction in English: Experiment 6 notebook"
author: "Jake W. Vincent (&#106;&#119;&#118;&#105;&#110;&#99;&#101;n&#64;&#117;c&#115;c.e&#100;u)"
output:
  html_notebook: 
    fig_caption: yes
    number_sections: yes
    theme: flatly
    code_folding: show
    css: style.css
    toc: true
  pdf_document: 
    keep_tex: yes
bibliography: ~/Documents/library.bib
---
```{r setup}
library(ggplot2)
library(dplyr)
library(tidyr)
library(magrittr)
```

```{r}
# Data wrangling done in exp6_setup_nb.Rmd
# Read in data
ratings_data_incl <- read.csv(file = "exp6_ratings.csv")

# Remove excluded participants' data
ratings_data_incl %>%
  subset(!(prolific_id %in% excluded)) %>%
  droplevels -> ratings_data_excl

ratings_data_excl$env <- relevel(ratings_data_excl$env, "vt")
ratings_data_excl$env <- relevel(ratings_data_excl$env, "ee")
ratings_data_excl$env <- relevel(ratings_data_excl$env, "complement")
ratings_data_excl$length <- relevel(ratings_data_excl$length, "short")
ratings_data_excl$structure <- relevel(ratings_data_excl$structure, "non-island")

# Add z-scores (by participant) for use later, INCLUDING burn-in trials
ratings_data_excl %>%
  subset(type == "experimental" | type == "filler" | type == "burn-in" | type == "burn-in_filler") %>%
  droplevels %>%
  group_by(prolific_id) %>%
  mutate(z_rating_all = scale(rating)) -> ratings_data_all

# Add z-scores (by participant) for use later, EXCLUDING burn-in trials
ratings_data_all %>%
  subset(type == "experimental" | type == "filler") %>%
  droplevels %>%
  group_by(prolific_id) %>%
  mutate(z_rating = scale(rating)) -> ratings_data

# Get experiment data
ratings_data %>%
  subset(type == "experimental") %>%
  droplevels -> exp_data

# Get filler data
ratings_data %>%
  subset(type == "filler") %>%
  droplevels -> filler_data
```

# Descriptive stats

```{r descriptive_summary}
# Get summary of experiment data
exp_data %>%
  group_by(env,
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary
print(summary)
```

```{r descriptive_plot}
# Make a descriptive plot
summary_forplot <- summary
summary_forplot[7,] <- summary_forplot[5,]
summary_forplot[8,] <- summary_forplot[6,]
summary_forplot$environment <- summary_forplot$env
summary_forplot$environment[5:6] <- c("vt", "vt")
summary_forplot$environment[7:8] <- c("ee", "ee")
summary_forplot$environment <- recode(summary_forplot$environment, vt = "VT", ee = "EE")

summary_forplot %>%
  ggplot(aes(x = length,
             y = mean_rating,
             ymin = 1,
             ymax = 6,
             color = structure,
             group = structure)) +
  facet_grid(.~environment) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) +
  geom_point(aes(col = structure),
             size = 2) +
  scale_y_continuous(breaks = seq(1:6)) +
  scale_color_discrete("Structure",
                       labels = c("Non-island",
                                  "Island")) +
  labs(x = "Length",
       y = "Mean rating") -> plot_summary
print(plot_summary)

# Make an alternative plot
summary_forplot %>%
  ggplot(aes(x = length,
             y = mean_rating,
             color = env,
             ymin = 1,
             ymax = 6)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) + 
  labs(title = "Main items") -> plot_summary_collapsed
print(plot_summary_collapsed)
```

## Breakdown by verb

```{r verb_breakdown}
exp_data %>%
  subset(structure == "island") %>%
  group_by(length,
           structure,
           verb, env) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> vbv_summary
print(vbv_summary)

for (vb in 1:length(unique(vbv_summary$verb))) {
  which.verb <- unique(vbv_summary$verb)[vb]
  vbv_summary %>%
    subset(verb == which.verb) %>%
    ggplot(aes(x = length,
               y = mean_rating,
               color = structure,
               group = structure,
               ymin = 1,
               ymax = 6)) +
    scale_y_continuous(breaks = seq(1:6)) +
    geom_point(data = subset(summary,
                             structure == "non-island"),
               aes(x = length,
                   y = mean_rating)) +
    geom_errorbar(data = subset(summary,
                                structure == "non-island"),
                  aes(ymin = mean_rating - se_rating,
                      ymax = mean_rating + se_rating),
                  width = 0.15) +
    geom_point(aes(col = structure),
               size = 2) +
    geom_errorbar(aes(ymin = mean_rating - se_rating,
                      ymax = mean_rating + se_rating),
                  width = 0.15) +
    labs(title = paste0(as.character(which.verb), " (", vbv_summary$env[vbv_summary$verb == which.verb], ")"),
         x = "Length",
         y = "Mean rating") -> plot
  print(plot)
}
```

### DD scores by verb

```{r vbv_dd_scores}
# Save baseline data
exp_data %>%
  subset(structure == "non-island") %>%
  droplevels %>%
  group_by(length) %>%
  summarize(mean_z_rating = mean(z_rating)) -> baseline_z_ratings

# Get DD scores for each verb
exp_data %>%
  subset(structure == "island") %>%
  droplevels %>%
  group_by(verb, env) %>%
  summarize(mean_z_rating = mean(z_rating),
            isl_sh = mean(z_rating[length == "short"]),
            isl_l = mean(z_rating[length == "long"]),
            nisl_sh = baseline_z_ratings$mean_z_rating[1],
            nisl_l = baseline_z_ratings$mean_z_rating[2],
            d1 = nisl_l - isl_l,
            d2 = nisl_sh - isl_sh,
            dd = d1 - d2) -> vbv_dd
print(vbv_dd)

# Plot verbs by DD score
vbv_dd %>%
  subset(select = c(verb, env, dd)) %>%
  ggplot(aes(x = reorder(verb, dd),
             y = dd,
             fill = env)) +
  geom_col() + 
  scale_fill_discrete(name = "Environment",
                      labels = c("VT", "EE")) +
  labs(x = "Verb",
       y = "DD score")
```

## Burn-in items (vs. main items)
```{r burn-in}
ratings_data_excl %>%
  subset(type == "burn-in") %>%
  group_by(env, 
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary_burn_in

print(summary_burn_in)
```

```{r plot_burn_in_summary}
summary_burn_in %>%
  ggplot(aes(x = length,
             y = mean_rating,
             color = env,
             ymin = 1,
             ymax = 6)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) + 
  labs(title = "Burn-in items") -> plot_summary_burn_in
print(plot_summary_burn_in)
# Compare
print(plot_summary_collapsed)
```

```{r}
# Considering burn-in and main items together
ratings_data_excl %>%
  subset(type == "experimental" | type == "burn-in") %>%
  droplevels %>%
  group_by(env,
           structure,
           length) %>%
  summarize(mean_rating = mean(rating),
            sd_rating = sd(rating),
            n = n(),
            se_rating = sd_rating/sqrt(n)) -> summary_combined
print(summary_combined)
# Plot
summary_combined %>%
  ggplot(aes(x = length,
             y = mean_rating,
             color = env,
             ymin = 1,
             ymax = 6)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = mean_rating - se_rating,
                    ymax = mean_rating + se_rating),
                width = 0.15) + 
  labs(title = "Combined burn-in and main items") -> plot_summary_combined
print(plot_summary_combined)
```

## Trial order effects
```{r}
ratings_data_excl %>%
  subset(type == "experimental" | type == "burn-in") %>%
  droplevels %>%
  subset(length == "long") %>%
  droplevels -> ratings_data_excl_combined

# Order the data frame by prolific id and trial
ratings_data_excl_combined_ordered <- ratings_data_excl_combined[order(ratings_data_excl_combined$epoch_time,
                                                                       ratings_data_excl_combined$env,
                                                                       ratings_data_excl_combined$trial),]

# Add new col for equalized trial
# Get number of observations per participant
obs <- nrow(ratings_data_excl_combined_ordered[ratings_data_excl_combined_ordered$epoch_time ==  min(ratings_data_excl_combined_ordered$epoch_time),])
ratings_data_excl_combined_ordered$trial_eq <- 1:(obs/3)

# Plot the ratings and draw a line
ratings_data_excl_combined_ordered %>%
  ggplot(aes(x = trial_eq,
             y = z_rating,
             color = env,
             group = env)) +
  geom_jitter() + 
  geom_smooth(method = "lm") -> plot_trial_order
print(plot_trial_order)
```

# CLMs

## Simple effects analysis

```{r}
library(ordinal)

# Make sure nominal variables are coded as factors
exp_data$rating %<>% as.factor
exp_data$env %<>% as.factor
exp_data$structure %<>% as.factor
exp_data$length %<>% as.factor

# Sum code structure and length factors
contrasts(exp_data$structure) <- c(-0.5, 0.5)
contrasts(exp_data$length) <- c(-0.5, 0.5)
# Contrasts for env factor should be dummy-coded correctly by default after reordering the factors

clm(data = exp_data,
    formula = rating ~ env * length) -> clm_analysis
summary(clm_analysis)
```

## Mixed effects analysis
```{r}
# Save the ratings data to run on hb.ucsc.edu
saveRDS(exp_data,
        file = "ratings_data_for_clmm.rds")

# Run the following clmm on the cluster:
# clmm(data = exp_data,
#      formula = rating ~ env * length +
#      (1 + env * length | prolific_id) +
#      (1 + env * length | item)) -> clmm_analysis
# saveRDS(clmm_analysis, "exp6_clmm.rds")
 
# Read in RDS from cluster
clmm_analysis <- readRDS(file = "exp6_clmm.rds")
summary(clmm_analysis)
```

