Federating the Social Web
  • Dissertation
  • Prospectus
  • Notebooks
    • Defederation
    • Newcomers
    • Rules
    • Other
  1. Newcomers Plots
  • Defederation Notebook
  • Newcomers Plots
  • Catch-all Notebook
  • Rules Study
    • Rules Study Notebook
    • Rules Memos

Newcomers Plots

  • Show All Code
  • Hide All Code

  • View Source
Code
library(here)

source(here("code/survival.R"))
plots <- NULL
plots$survival <- plot_km
Code
account_timeline_plot <- function() {
jm <- arrow::read_feather(here("data/newcomers/scratch/joinmastodon.feather"))
moved_to <- arrow::read_feather(here("data/newcomers/scratch/individual_moved_accounts.feather"))
accounts_unfilt <- arrow::read_feather(
  here("data/newcomers/scratch/all_accounts.feather"),
  col_select=c(
    "server", "username", "created_at", "last_status_at",
    "statuses_count", "has_moved", "bot", "suspended",
    "following_count", "followers_count", "locked",
    "noindex", "group", "discoverable"
  ))
accounts <- accounts_unfilt %>%
  filter(!bot) %>%
  # TODO: what's going on here?
  filter(!is.na(last_status_at)) %>%
  mutate(suspended = replace_na(suspended, FALSE)) %>%
  # sanity check
  filter(created_at >= "2020-10-01") %>%
  #filter(created_at < "2024-01-01") %>%
  filter(created_at < "2023-08-15") %>%
  # We don't want accounts that were created and then immediately stopped being active
  filter(statuses_count >= 1) %>%
  filter(last_status_at >= created_at) %>%
  mutate(active = last_status_at >= "2024-01-01") %>%
  mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"), last_status_at)) %>%
  mutate(active_time = difftime(last_status_at, created_at, units="days")) #%>%
  #filter(!has_moved)
acc_data <- accounts %>%
  #filter(!has_moved) %>%
  mutate(created_month = format(created_at, "%Y-%m")) %>%
  mutate(created_week = floor_date(created_at, unit = "week")) %>%
  mutate(active_now = active) %>%
  mutate(active = active_time >= 91) %>%
  mutate("Is mastodon.social" = server == "mastodon.social") %>%
  mutate(jm = server %in% jm$domain) %>%
  group_by(created_week) %>%
  summarize(
    `JoinMastodon Server` = sum(jm) / n(),
    `Is mastodon.social` = sum(`Is mastodon.social`)/n(),
    Suspended = sum(suspended)/n(),
    Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
    active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
    Moved=sum(has_moved)/n(),
    count=n()) %>%
  pivot_longer(cols=c("JoinMastodon Server", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"

p1 <- acc_data %>%
  ggplot(aes(x=as.Date(created_week), group=1)) +
  geom_line(aes(y=value, group=Measure, color=Measure)) +
  geom_point(aes(y=value, color=Measure), size=0.7) +
  scale_y_continuous(limits = c(0, 1.0)) +
  labs(y="Proportion") + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week") +
  theme_bw_small_labels() +
  theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2 <- acc_data %>%
  distinct(created_week, count) %>%
  ggplot(aes(x=as.Date(created_week), y=count)) +
  geom_bar(stat="identity", fill="black") +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2022-10-27"))),
    linetype="dashed", color = "black") +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2022-04-14"))),
    linetype="dashed", color = "black") +
  # https://twitter.com/elonmusk/status/1675187969420828672
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2022-12-15"))),
    linetype="dashed", color = "black") +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2023-07-01"))),
    linetype="dashed", color = "black") +
  #scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
  scale_y_continuous(labels = scales::comma) + 
  labs(y="Count", x="Created Week") +
  theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week")
return(p1 + p2 + plot_layout(ncol = 1, guides = "collect"))
}
plots$timeline <- account_timeline_plot() |>
  print()
Figure 1
Code
library(ergm)
library(tinytable)
library(modelsummary)

load(file = here("data/newcomers/scratch/ergm-model-early.rda"))
load(file = here("data/newcomers/scratch/ergm-model-late.rda"))

plots$ergm.table <- modelsummary::modelsummary(
  list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
  estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
  statistic = NULL,
  gof_omit = ".*",
  coef_rename = c(
    "sum" = "(Sum)",
    "diff.sum0.h-t.accounts" = "Smaller server",
    "nodeocov.sum.accounts" = "Server size (outgoing)",
    "nodeifactor.sum.registrations.TRUE" = "Open registrations (incoming)",
    "nodematch.sum.language" = "Languages match"
  ),
  align="lrrrr",
  stars = c('*' = .05, '**' = 0.01, '***' = .001),
  ) %>%
    tinytable::group_tt(j = list("Model A" = 2:3, "Model B" = 4:5))

plots$ergm.table 
Table 1: ERGM model output
Model A Model B
Coef. Std.Error Coef. Std.Error
(Sum) -9.529 ***0.188 -10.268 ***0.718
nonzero -3.577 ***0.083 -2.861 ***0.254
Smaller server 0.709 ***0.032 0.629 ***0.082
Server size (outgoing) 0.686 ***0.013 0.655 ***0.042
Open registrations (incoming) 0.168 ***0.046 -0.250 0.186
Languages match 0.044 0.065 0.589 0.392
Code
library(ehahelper)
library(broom)

cxme_table <- tidy(cxme) %>%
  mutate(conf.low = exp(conf.low), conf.high=exp(conf.high)) %>%
  mutate(term = case_when(
    term == "factor(group)1" ~ "Join Mastodon",
    term == "factor(group)2" ~ "General Servers",
    term == "small_serverTRUE" ~ "Small Server",
    TRUE ~ term
  )) %>%
  #mutate(exp.coef = paste("(", round(conf.low, 2), ", ", round(conf.high, 2), ")", sep="")) %>%
  select(term, estimate, conf.low, conf.high, p.value) |>
  rename("Estimate" = "estimate", "Low" = "conf.low", "High" = "conf.high", "p-value" = "p.value", "Term" = "term")

plots$coxme.table <- cxme_table |>
  tinytable::tt() |>
  tinytable::style_tt(align = "ldddd") |>
  tinytable::format_tt(i = 1:3, j = 2:4, digits = 3, num_fmt = "significant", num_suffix = F) |>
  tinytable::format_tt(i = 1:3, j = 5, digits = 1, num_fmt = "significant", num_suffix = F) |>
  print()
Table 2

+-----------------+----------+-------+-------+---------+
| Term            | Estimate | Low   | High  | p-value |
+=================+==========+=======+=======+=========+
| Join Mastodon   | 0.115    | 0.972 | 1.296 | 0.117   |
+-----------------+----------+-------+-------+---------+
| General Servers | 0.385    | 1.071 | 2.015 | 0.017   |
+-----------------+----------+-------+-------+---------+
| Small Server    | -0.245   | 0.664 | 0.922 | 0.003   |
+-----------------+----------+-------+-------+---------+ 
Code
saveRDS(plots, here::here("data/processed/newcomers_plots.rds"))
Source Code
---
title: "Newcomers Plots"
---

```{r}
library(here)

source(here("code/survival.R"))
plots <- NULL
plots$survival <- plot_km
```

```{r}
#| label: fig-newcomer-timeline
account_timeline_plot <- function() {
jm <- arrow::read_feather(here("data/newcomers/scratch/joinmastodon.feather"))
moved_to <- arrow::read_feather(here("data/newcomers/scratch/individual_moved_accounts.feather"))
accounts_unfilt <- arrow::read_feather(
  here("data/newcomers/scratch/all_accounts.feather"),
  col_select=c(
    "server", "username", "created_at", "last_status_at",
    "statuses_count", "has_moved", "bot", "suspended",
    "following_count", "followers_count", "locked",
    "noindex", "group", "discoverable"
  ))
accounts <- accounts_unfilt %>%
  filter(!bot) %>%
  # TODO: what's going on here?
  filter(!is.na(last_status_at)) %>%
  mutate(suspended = replace_na(suspended, FALSE)) %>%
  # sanity check
  filter(created_at >= "2020-10-01") %>%
  #filter(created_at < "2024-01-01") %>%
  filter(created_at < "2023-08-15") %>%
  # We don't want accounts that were created and then immediately stopped being active
  filter(statuses_count >= 1) %>%
  filter(last_status_at >= created_at) %>%
  mutate(active = last_status_at >= "2024-01-01") %>%
  mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"), last_status_at)) %>%
  mutate(active_time = difftime(last_status_at, created_at, units="days")) #%>%
  #filter(!has_moved)
acc_data <- accounts %>%
  #filter(!has_moved) %>%
  mutate(created_month = format(created_at, "%Y-%m")) %>%
  mutate(created_week = floor_date(created_at, unit = "week")) %>%
  mutate(active_now = active) %>%
  mutate(active = active_time >= 91) %>%
  mutate("Is mastodon.social" = server == "mastodon.social") %>%
  mutate(jm = server %in% jm$domain) %>%
  group_by(created_week) %>%
  summarize(
    `JoinMastodon Server` = sum(jm) / n(),
    `Is mastodon.social` = sum(`Is mastodon.social`)/n(),
    Suspended = sum(suspended)/n(),
    Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
    active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
    Moved=sum(has_moved)/n(),
    count=n()) %>%
  pivot_longer(cols=c("JoinMastodon Server", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"

p1 <- acc_data %>%
  ggplot(aes(x=as.Date(created_week), group=1)) +
  geom_line(aes(y=value, group=Measure, color=Measure)) +
  geom_point(aes(y=value, color=Measure), size=0.7) +
  scale_y_continuous(limits = c(0, 1.0)) +
  labs(y="Proportion") + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week") +
  theme_bw_small_labels() +
  theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2 <- acc_data %>%
  distinct(created_week, count) %>%
  ggplot(aes(x=as.Date(created_week), y=count)) +
  geom_bar(stat="identity", fill="black") +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2022-10-27"))),
    linetype="dashed", color = "black") +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2022-04-14"))),
    linetype="dashed", color = "black") +
  # https://twitter.com/elonmusk/status/1675187969420828672
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2022-12-15"))),
    linetype="dashed", color = "black") +
  geom_vline(
    aes(xintercept = as.numeric(as.Date("2023-07-01"))),
    linetype="dashed", color = "black") +
  #scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
  scale_y_continuous(labels = scales::comma) + 
  labs(y="Count", x="Created Week") +
  theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week")
return(p1 + p2 + plot_layout(ncol = 1, guides = "collect"))
}
plots$timeline <- account_timeline_plot() |>
  print()
```

```{r}
#| label: tbl-ergm
#| tbl-cap: ERGM model output

library(ergm)
library(tinytable)
library(modelsummary)

load(file = here("data/newcomers/scratch/ergm-model-early.rda"))
load(file = here("data/newcomers/scratch/ergm-model-late.rda"))

plots$ergm.table <- modelsummary::modelsummary(
  list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
  estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
  statistic = NULL,
  gof_omit = ".*",
  coef_rename = c(
    "sum" = "(Sum)",
    "diff.sum0.h-t.accounts" = "Smaller server",
    "nodeocov.sum.accounts" = "Server size (outgoing)",
    "nodeifactor.sum.registrations.TRUE" = "Open registrations (incoming)",
    "nodematch.sum.language" = "Languages match"
  ),
  align="lrrrr",
  stars = c('*' = .05, '**' = 0.01, '***' = .001),
  ) %>%
    tinytable::group_tt(j = list("Model A" = 2:3, "Model B" = 4:5))

plots$ergm.table 
```

```{r}
#| label: tbl-coxme
#| fig-cap: "Cox Proportional Hazard Model with Mixed Effects. The model includes a random effect for the server."
library(ehahelper)
library(broom)

cxme_table <- tidy(cxme) %>%
  mutate(conf.low = exp(conf.low), conf.high=exp(conf.high)) %>%
  mutate(term = case_when(
    term == "factor(group)1" ~ "Join Mastodon",
    term == "factor(group)2" ~ "General Servers",
    term == "small_serverTRUE" ~ "Small Server",
    TRUE ~ term
  )) %>%
  #mutate(exp.coef = paste("(", round(conf.low, 2), ", ", round(conf.high, 2), ")", sep="")) %>%
  select(term, estimate, conf.low, conf.high, p.value) |>
  rename("Estimate" = "estimate", "Low" = "conf.low", "High" = "conf.high", "p-value" = "p.value", "Term" = "term")

plots$coxme.table <- cxme_table |>
  tinytable::tt() |>
  tinytable::style_tt(align = "ldddd") |>
  tinytable::format_tt(i = 1:3, j = 2:4, digits = 3, num_fmt = "significant", num_suffix = F) |>
  tinytable::format_tt(i = 1:3, j = 5, digits = 1, num_fmt = "significant", num_suffix = F) |>
  print()
```


```{r}
#| label: save-data

saveRDS(plots, here::here("data/processed/newcomers_plots.rds"))
```