Defederation Notebook

This notebook takes the data from Colglazier, TeBlunthuis, and Shaw (2024) and formats it for easy presentation across formats.

Code

library(tidyverse)
library(here)
library(scales)

source(here::here("code/common.R"))

## plot stuff
quantile_cl <- function(y, q=0.5, conf.level = 0.95, na.rm=TRUE) {
  alpha <- 1 - conf.level
  if (na.rm) y <- y[!is.na(y)]
  n <- length(y)
  l <- qbinom(alpha/2, size=n, prob = q)
  u <- 1 + n - l
  ys <- sort.int(c(-Inf, y, Inf), partial = c(1 + l, 1 + u))
  data.frame(
    y = quantile(y, probs = q, na.rm=na.rm, type = 8),
    ymin = ys[1 + l],
    ymax = ys[1 + u]
  )
}

median_cl <- function(y, conf.level=0.95, na.rm=TRUE) quantile_cl(y, q=0.5, conf.level=conf.level, na.rm=na.rm)

Code

data <- NULL
plots <- NULL

data$metadata <- readRDS(here::here("data/defederation/metadata.rds"))
data$model.alpha.data <- readRDS(here("data/defederation/model_activity_blocked_data.rds")) %>%
  mutate(blocking="Blocked")
data$model.beta.data <- readRDS(here("data/defederation/model_activity_blocking_data.rds")) %>%
  mutate(blocking="Blocking")
data$timeseries.byweek <- readRDS(here("data/defederation/timeseries_byweek.rds"))
data$timeseries.byposttreatment <- readRDS(here("data/defederation/timeseries_byposttreatment.rds"))

data$plotdf <- rbind(
  (data$model.alpha.data %>% select(id, selected, post_treatment, time, blocking, count)),
  (data$model.beta.data %>% select(id, selected, post_treatment, time, blocking, count))
) %>%
  mutate(tgroup=dplyr::case_match(selected, 0 ~ "Matched",1 ~ "Treatment")) %>%
  mutate(treatment=post_treatment)

data$balance.blocked <- readRDS(here("data/defederation/matches_blocked_bal_tab.rds"))
data$balance.blocking <- readRDS(here("data/defederation/matches_blocking_bal_tab.rds"))


as_tibble(data$plotdf)

# A tibble: 21,216 × 8
   id              selected post_treatment  time blocking count tgroup treatment
   <chr>           <lgl>    <lgl>          <int> <chr>    <int> <chr>  <lgl>    
 1 51337076349112… FALSE    FALSE            -12 Blocked     57 Match… FALSE    
 2 51337076349112… FALSE    FALSE            -11 Blocked     32 Match… FALSE    
 3 51337076349112… FALSE    FALSE            -10 Blocked     61 Match… FALSE    
 4 51337076349112… FALSE    FALSE             -9 Blocked     44 Match… FALSE    
 5 51337076349112… FALSE    FALSE             -8 Blocked     20 Match… FALSE    
 6 51337076349112… FALSE    FALSE             -7 Blocked     58 Match… FALSE    
 7 51337076349112… FALSE    FALSE             -6 Blocked     12 Match… FALSE    
 8 51337076349112… FALSE    FALSE             -5 Blocked     29 Match… FALSE    
 9 51337076349112… FALSE    FALSE             -4 Blocked     26 Match… FALSE    
10 51337076349112… FALSE    FALSE             -3 Blocked     48 Match… FALSE    
# ℹ 21,206 more rows

Code

plots$defed.timeline <- data$timeseries.byposttreatment %>%
    filter(selected) %>% filter(post_treatment==FALSE) %>%
    group_by(id) %>% slice(1) %>%
    group_by(block_type, treatment_date) %>% summarise(n = n()) %>% ungroup() %>%
    group_by(block_type) %>% mutate(cs=cumsum(n)) %>%
    ggplot(aes(y=cs, x=treatment_date, fill=block_type)) +
    geom_area(position = "identity", colour = "black", linewidth = .2, alpha=0.5) +
    #scale_fill_manual(values=c("grey80", "grey30")) +
    viridis::scale_fill_viridis(discrete=TRUE, option="mako") +
    theme(legend.title=element_blank()) +
    guides(linetype=guide_legend()) +
    scale_x_date(seq(as.Date("2021-04-01"), as.Date("2022-07-01"), by="6 months"), date_labels =  "%b %Y") +
    theme(axis.text.x=element_text(angle=30, hjust=1), axis.title.x=element_blank()) +
    labs(y="Num. of accounts") +
    theme(legend.position="top")

plots$defed.timeline

Figure 1: **The y-axis shows the cumulative number of blocked and blocking accounts included in our analysis over our study period.**

Code

balance.blocked <- readRDS(here::here("data/defederation/matches_blocked_bal_tab.rds"))
balance.blocking <- readRDS(here::here("data/defederation/matches_blocking_bal_tab.rds"))

balances <- rbind(
  balance.blocked$Balance %>% mutate(group_type="Blocked") %>% rownames_to_column("Covariates"),
  balance.blocking$Balance %>% mutate(group_type="Blocking") %>% rownames_to_column("Covariates")
)
  
plots$love <- balances %>% select(Covariates, group_type, Diff.Un, Diff.Adj) %>%
  gather(key="Adjusted", value="value", Diff.Un, Diff.Adj) %>%
  mutate(
    Adjusted=case_when(Adjusted == "Diff.Un" ~ "Unadjusted", Adjusted == "Diff.Adj" ~ "Adjusted"),
    Covariates=case_when(
      Covariates == "log_count_sum" ~ "Post count",
      Covariates == "l_count_late" ~ "Post count (45)",
      Covariates == "log_replies" ~ "Replies",
      Covariates == "log_out_domains" ~ "Domains",
      Covariates == "log_user_count" ~ "Server accts.",
      Covariates == "treatment_date" ~ "Date",
    )
  ) %>%
  ggplot(aes(x=Covariates, y=value, shape=Adjusted)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
    geom_point() +
    coord_flip() +
    facet_wrap(. ~ group_type, ncol=2) +
    scale_shape_manual(values=c(16, 2)) +
    theme( legend.title=element_blank(), legend.position="top", axis.title.y=element_blank(),
    axis.text.y = element_text(hjust=0),
    panel.spacing = unit(2, "lines"),
    legend.margin=margin(c(1,1,1,1))
    ) +
    ylab("\n Standardized Mean Differences")

plots$love

Figure 2: A covariate balance plot shows the standardized mean difference between treatment and control groups for each measure used in our matching procedure before (unadjusted) and after (adjusted) matching. Our procedure effectively found a group of matched controls similar to the treated accounts along these measures.

Code

plots$ivs <- data$timeseries.byposttreatment %>%
  select(id, selected, post_treatment, toxicity, count) %>%
  mutate("Post count (log)" = log(count), "Toxicity" = toxicity) %>%
  select(id, selected, post_treatment, "Post count (log)", "Toxicity") %>%
  pivot_longer(
    cols = c("Post count (log)", "Toxicity"),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(group = paste(selected, post_treatment)) %>%
  mutate(g = factor(group)) %>%
  ggplot() +
  geom_boxplot(aes(y = value, group = g, fill = g)) + 
  scale_fill_manual(
    values = viridis_pal(alpha = 0.5, option = "viridis")(4),
    labels = c("Control (pre)", "Control (post)", 
               "Treatment (pre)", "Treatment (post)")
  ) +
  facet_wrap(~ variable, scales = "free", ncol = 1) +
  coord_flip() +
  theme(
    legend.title = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank(), 
    axis.title.x = element_blank()
  )
plots$ivs

Figure 3: Box and whisker plots visualize the distributions of our dependent variables within the blocked and blocking groups of user accounts and their matched controls before and after defederation. The lines correspond to the median, the boxes to the inter-quartile range (IQR), the whiskers to the range of the data within 1.5 * IQR, and the dots to data points outside the range of the whiskers.

Code

active_byweek <- data$timeseries.byweek %>%
    mutate(selected=ifelse(selected==TRUE, "Treatment", "Control")) %>%
    group_by(block_type, selected, time) %>%
    summarize(active=sum(count > 0)/n(), .groups="drop") 

data$pdf <- data$plotdf %>%
  group_by(blocking, selected, time) %>% summarize(median=median(count), mean=mean(count), median_cl(count)) 

plots$median.counts <- data$pdf %>%
    mutate(selected=ifelse(selected==TRUE, "Treatment", "Control")) %>%
    ggplot(., aes(x=time, y=median, ymax=ymax, ymin=ymin, group=interaction(selected, blocking))) +
    facet_wrap(. ~ blocking, ncol=1) +
    geom_line(aes(linetype=selected)) +
    geom_ribbon(aes(fill=selected), alpha=0.25) +
    scale_linetype_manual(values = c("Control" = "dotted", "Treatment" = "solid")) +
    geom_vline(xintercept = 0, linetype="dashed", color="black") +
    expand_limits(y = 0) +
    #scale_fill_manual(values=c("grey80", "grey30")) +
    viridis::scale_fill_viridis(discrete=TRUE, option="inferno", direction=-1) +
    theme(legend.title= element_blank(),legend.position="bottom") +
    labs(x="Weeks since treatment", y="Median post count")

plots$median.counts

Code

df_bna <- data$timeseries.byposttreatment

run_wilcox <- function(df, keys) {
    med <- df %>% group_by(post_treatment) %>% summarize(m=median(count)) %>% pull(m) %>% diff
    t <- wilcox.test(count ~ post_treatment, alternative="two.sided", data=df)
    return(data.frame(block_type=keys$block_type, selected=keys$selected, median=med, W=t$statistic, p=t$p.value))
}

run_wilcox_comp <- function(df, keys) {
    med <- df %>% group_by(selected) %>% summarize(m=median(count)) %>% pull(m) %>% diff
    t <- wilcox.test(count ~ selected, alternative="two.sided", data=df)
    return(data.frame(block_type=keys$block_type, median=med, W=t$statistic, p=t$p.value))
}

table1 <- df_bna %>% arrange(block_type, desc(selected)) %>%
    group_by(block_type, selected) %>% group_map(~ run_wilcox(.x, .y)) %>% bind_rows() %>%
    arrange(block_type, desc(selected)) %>% add_column(Group=c("$U_0$", "$C_0$", "$U_1$", "$C_1$")) %>% select(Group, median, W, p)

join_pre_post <- function(df) {
  return(
    inner_join(
      (df %>% filter(post_treatment == TRUE) %>% rename(post_count = count)),
      (df %>% filter(post_treatment == FALSE) %>% rename(pre_count = count)),
      by=c("id", "block_type", "selected")))
}

pre_and_post_counts <- df_bna %>% select(id, block_type, selected, post_treatment, count) %>% as_tibble %>% join_pre_post %>%
  mutate(count = post_count - pre_count)

median_change <- pre_and_post_counts %>% group_by(block_type, selected) %>% summarize(c=median(count), pre=median(pre_count)) %>% mutate(change=c/pre)

table2 <- pre_and_post_counts %>%
  group_by(block_type) %>% group_map(~ run_wilcox_comp(.x, .y)) %>% bind_rows() %>%
  add_column(Group=c("$\\Delta_0$", "$\\Delta_1$")) %>% select(Group, median, W, p)

plots$tbl.wilcox.activity <- rbind(table1, table2) |> 
  tinytable::tt() |>
  #tinytable::format_tt(j = 1, quarto = TRUE) |>
  tinytable::format_tt(j = 4, digits = 3, num_fmt = "decimal", num_zero = T, num_suffix = F) |>
  tinytable::format_tt(j = 2:3, digits = 1, num_fmt = "decimal", num_zero=T) |>
  tinytable::style_tt(i = 5, line = "t", line_color = "black", line_width = 0.1) |>
  tinytable::style_tt(j = 2:4, align = "d")

plots$tbl.wilcox.activity

Group	median	W	p
$U_0$	-135.5	41197.5	0.000
$C_0$	-18.0	35762.0	0.143
$U_1$	-54.5	12413.0	0.122
$C_1$	-53.5	12520.0	0.091
$\Delta_0$	-39.0	39927.0	0.000
$\Delta_1$	3.0	10645.5	0.421

Code

did_activity_df <- readRDS(here::here("data/defederation/table_activity.rds"))
plots$tbl.activity.did <- did_activity_df %>%
  filter(block_type=="Blocked") %>%
  select(-block_type) %>%
  left_join(
    did_activity_df %>% filter(block_type=="Blocking") %>% select(-block_type),
    by = c("Term"="Term")
) %>%
  tinytable::tt() %>%
  setNames(c("Term", "Estimate", "Std. err.", "Low", "High", "Estimate", "Std. err.", "Low", "High")) %>%
  tinytable::group_tt(j = list("Blocked" = 2:5, "Blocking" = 6:9)) %>%
  tinytable::format_tt(digits = 3, num_fmt = "decimal", num_zero = T, num_suffix = F) |>
  tinytable::format_tt(j = 1, quarto = TRUE) |>
  tinytable::style_tt(j = 1, align = "l") |>
  tinytable::theme_tt("resize", width = 1.0)

plots$tbl.activity.did

Table 1

	Blocked				Blocking
Term	Estimate	Std. err.	Low	High	Estimate	Std. err.	Low	High
$\beta_0$ (Intercept)	3.216	0.086	3.049	3.380	2.901	0.118	2.684	3.121
$\beta_1$ Group	-0.024	0.116	-0.255	0.198	0.023	0.177	-0.290	0.367
$\beta_2$ Treatment	-0.089	0.045	-0.178	0.002	-0.080	0.060	-0.201	0.034
$\beta_3$ Time	0.006	0.004	-0.002	0.015	-0.002	0.006	-0.014	0.009
$\beta_4$ Treatment : Time	-0.026	0.006	-0.038	-0.014	-0.027	0.008	-0.042	-0.010
$\beta_5$ Group : Treatment	-0.241	0.065	-0.367	-0.116	-0.084	0.082	-0.247	0.072
$\beta_6$ Group : Time	-0.007	0.006	-0.020	0.004	0.006	0.008	-0.010	0.021
$\beta_7$ Group : Treatment : Time	-0.015	0.009	-0.031	0.002	0.009	0.011	-0.013	0.032

Code

draws.blocked <- readRDS(here::here("data/defederation/draws_activity_blocked.rds")) %>% mutate(block_type="Blocked")
draws.blocking <- readRDS(here::here("data/defederation/draws_activity_blocking.rds")) %>% mutate(block_type="Blocking")
draws <- rbind(draws.blocked, draws.blocking)
plots$fig.activity.did <- draws %>% group_by(block_type, selected, post_treatment, time) %>%
  summarise(median=median(.value), low=quantile(.value, 0.025), high=quantile(.value, 0.975)) %>%
  mutate(selected=ifelse(selected==TRUE, "Treatment", "Control")) %>%
  ggplot(., aes(x=time, y=median, group=interaction(selected, post_treatment))) +
  geom_line(aes(linetype=selected)) +
  geom_ribbon(aes(ymin=low, ymax=high, x=time, fill=selected), alpha=0.25) +
  scale_linetype_manual(values = c("Control" = "dotted", "Treatment" = "solid")) +
  facet_wrap(. ~ block_type,ncol=1) +
  #scale_fill_manual(values=c("grey80", "grey30")) +
  viridis::scale_fill_viridis(discrete=TRUE, option="inferno", direction=-1) +
  theme(legend.title= element_blank(),legend.position="bottom") +
  ylab("Post count (log)") +
  xlab("Weeks since treatment")
plots$fig.activity.did

Figure 5: A marginal effects plot visualizes results from our difference-in-differences analysis of account activity by week. We observe a discontinuous activity decrease among accounts on the blocked server that exceeds any decrease in the matched controls, but no corresponding change among accounts on the blocking server. The bands are 95% credible intervals that account for uncertainty in parameter estimates.

Code

plots$median.toxicity <- data$timeseries.byweek %>% tidyr::drop_na(toxicity) %>%
  group_by(block_type, selected, time) %>%
  summarize(median=median(toxicity), mean=mean(toxicity), median_cl(toxicity)) %>%
    mutate(selected=ifelse(selected==TRUE, "Treatment", "Control")) %>%
    mutate(selected=factor(selected)) %>%
    ggplot(., aes(x=time, y=median, ymax=ymax, ymin=ymin, group=interaction(selected, block_type))) +
    facet_wrap(. ~ block_type, ncol=1) +
    geom_line(aes(linetype=selected)) +
    geom_ribbon(aes(fill=selected), alpha=0.25) +
    scale_linetype_manual(values = c("Control" = "dotted", "Treatment" = "solid"))  +
    geom_vline(xintercept = 0, linetype="dashed", color="black") +
    expand_limits(y = 0) +
    #scale_fill_manual(values=c("grey80", "grey30")) +
    viridis::scale_fill_viridis(discrete=TRUE, option="inferno", direction=-1) +
    theme(legend.title= element_blank(), legend.position='bottom') +
    labs(x="Weeks since treatment", y="Median toxicity")  + scale_y_continuous(breaks=breaks_pretty(n=3))

plots$median.toxicity

Figure 7: Median toxicity among accounts which posted each week for blocked and blocking user accounts. The median toxicity remained flat for all groups.

Code

df_bna <- semi_join(
    (data$timeseries.byposttreatment %>% drop_na(toxicity)),
    (data$timeseries.byposttreatment %>% drop_na(toxicity) %>% group_by(id) %>% count %>% filter(n==2)),
    by="id"
)

run_wilcox <- function(df, keys) {
    med <- df %>% group_by(post_treatment) %>% summarize(m=median(toxicity)) %>% pull(m) %>% diff
    t <- wilcox.test(toxicity ~ post_treatment, alternative="two.sided", data=df)
    return(data.frame(block_type=keys$block_type, selected=keys$selected, median=med, W=t$statistic, p=t$p.value))
}

run_wilcox_comp <- function(df, keys) {
    med <- df %>% group_by(selected) %>% summarize(m=median(toxicity)) %>% pull(m) %>% diff
    t <- wilcox.test(toxicity ~ selected, alternative="two.sided", data=df)
    return(data.frame(block_type=keys$block_type, median=med, W=t$statistic, p=t$p.value))
}

data$table1 <- df_bna %>% arrange(block_type, desc(selected)) %>%
    group_by(block_type, selected) %>% group_map(~ run_wilcox(.x, .y)) %>% bind_rows() %>%
    arrange(block_type, desc(selected)) %>% add_column(Group=c("$U_0$", "$C_0$", "$U_1$", "$C_1$")) %>% select(Group, median, W, p)
table2 <- df_bna %>% arrange(id) %>% group_by(block_type, selected) %>% 
    reframe(toxicity = toxicity[post_treatment == TRUE] - toxicity[post_treatment == FALSE]) %>%
    group_by(block_type) %>% group_map(~ run_wilcox_comp(.x, .y)) %>% bind_rows() %>%
    add_column(Group=c("$\\Delta_0$", "$\\Delta_1$")) %>% select(Group, median, W, p)

plots$tbl.toxicity.wilcoxon <- rbind(data$table1, table2) |> 
  tinytable::tt() |>
  tinytable::format_tt(j = 1, quarto = TRUE) |>
  tinytable::format_tt(j = c(2, 4), digits = 3, num_fmt = "decimal", num_zero = T, num_suffix = F) |>
  tinytable::format_tt(j = 3, digits = 1, num_fmt = "decimal", num_zero=T) |>
  tinytable::style_tt(i = 5, line = "t", line_color = "black", line_width = 0.1) |>
  tinytable::style_tt(j = 2:4, align = "d")

plots$tbl.toxicity.wilcoxon

Table 2

Group	median	W	p
$U_0$	-0.006	17746	0.538
$C_0$	0.004	14000	0.950
$U_1$	-0.008	6514	0.619
$C_1$	0.001	5546	0.873
$\Delta_0$	-0.005	17161	0.072
$\Delta_1$	0.000	6414	0.305

Code

did_toxicity_df <- readRDS(here::here("data/defederation/table_toxicity.rds"))
toxicity_df <- did_toxicity_df %>% select(selected, term, value, sd, .lower, .upper, block_type)
plots$tbl.toxicity.did <- toxicity_df %>%
  filter(block_type=="Blocked") %>%
  select(-block_type) %>%
  left_join(
  toxicity_df %>% filter(block_type=="Blocking") %>% select(-block_type),
    by = c("term", "selected")
  ) %>%
  mutate(selected = ifelse(selected, "Control", "Treatment"))
  
plots$tbl.toxicity.did |>
  tinytable::tt() %>%
  setNames(c("Group", "Term", "Estimate", "Std. err.", "Low", "High", "Estimate", "Std. err.", "Low", "High")) %>%
  tinytable::group_tt(j = list("Blocked" = 2:6, "Blocking" = 6:9)) |>
  tinytable::format_tt(digits = 3, num_fmt = "decimal", num_zero = T, num_suffix = F) |>
  tinytable::format_tt(j = 2, quarto = TRUE) |>
  tinytable::style_tt(j = 3:10, align = "d")

Table 3

	Blocked					Blocking
Group	Term	Estimate	Std. err.	Low	High	Estimate	Std. err.	Low	High
Treatment	$\beta_0$ (Intercept)	-0.143	0.438	-1.197	0.763	-0.042	0.284	-0.754	0.518
Treatment	$\beta_1$ Treatment	0.002	0.006	-0.009	0.018	-0.001	0.010	-0.020	0.017
Treatment	$\beta_2$ Time	0.001	0.001	-0.000	0.002	0.001	0.001	-0.001	0.003
Treatment	$\beta_3$ Treatment : Time	-0.002	0.001	-0.004	0.000	-0.001	0.002	-0.004	0.003
Control	$\beta_4$ (Intercept)	0.136	0.436	-0.907	1.044	0.048	0.283	-0.613	0.649
Control	$\beta_5$ Treatment	-0.000	0.006	-0.016	0.011	0.004	0.009	-0.014	0.024
Control	$\beta_6$ Time	0.001	0.001	-0.000	0.002	0.003	0.001	0.001	0.005
Control	$\beta_7$ Treatment : Time	-0.005	0.001	-0.007	-0.003	-0.006	0.002	-0.009	-0.002

Beta regression coefficients drawn from the posterior of the parametric toxicity DiD model for user accounts whose server was defederated (blocked group) or whose server defederated another (blocking group). For all groups, the 95% credible intervals for a change in toxicity levels after treatment ($\beta_1$, $\beta_5$) contain 0.

Code

saveRDS(data, here::here("data/processed/defederation_data.rds"))
saveRDS(plots, here::here("data/processed/defederation_plots.rds"))

References

Colglazier, Carl, Nathan TeBlunthuis, and Aaron Shaw. 2024. “The Effects of Group Sanctions on Participation and Toxicity: Quasi-experimental Evidence from the Fediverse.” Proceedings of the International AAAI Conference on Web and Social Media 18 (May): 315–28. https://doi.org/10.1609/icwsm.v18i1.31316.