Federating the Social Web
  • Dissertation
  • Prospectus
  • Notebooks
    • Defederation
    • Newcomers
    • Rules
    • Other
  1. Rules Study
  2. Rules Study Notebook
  • Defederation Notebook
  • Newcomers Plots
  • Catch-all Notebook
  • Rules Study
    • Rules Study Notebook
    • Rules Memos
  1. Rules Study
  2. Rules Study Notebook

Rules Study Notebook

  • Show All Code
  • Hide All Code

  • View Source
Code
library(here)
library(ggplot2)
library(arrow)
library(dplyr)
library(viridis)

server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet"))
date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet"))

# Process first appearances
server_appearance_dates <- server_first_last |>
  rename(date = first_date) |>
  count(date)
server_app_counts <- server_appearance_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Process last appearances
server_last_dates <- server_first_last |>
  rename(date = last_date) |>
  count(date)
server_last_counts <- server_last_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Process first rule appearances (excluding NA values)
server_rule_dates <- server_first_last |>
  filter(!is.na(first_rule)) |>
  rename(date = first_rule) |>
  count(date)
server_rule_counts <- server_rule_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Create the plot
plots$data.timeline <- ggplot() +
  # Add first appearances area
  geom_area(
    data = server_app_counts,
    aes(x = date, y = count, fill = "Server First Seen"),
    #alpha = 0.6
  ) +
  # Add last appearances area
  geom_area(
    data = server_last_counts,
    aes(x = date, y = count, fill = "Server Last Seen")
  ) +
  # Add first rule appearances area
  geom_area(
    data = server_rule_counts,
    aes(x = date, y = count, fill = "Rules Seen")
  ) +
  # Use viridis color palette
  scale_fill_viridis(
    discrete = TRUE,
    option = "G",
    name = "Server Timeline",
    alpha = 0.5,
    breaks = c("Server First Seen", "Server Last Seen", "Rules Seen")
  ) +
  # Add rug plot for dates without crawls
  geom_rug(
    data = subset(date_has_data, !crawled),
    aes(x = date),
    sides = "b",
    length = unit(0.03, "npc"),
    color = "#000000",
    alpha = 0.5
  ) +
  theme_minimal() +
  labs(
    #title = "Server Timeline: First Appearance, Last Seen, and Rules Added",
    #x = "Date",
    y = "Cumulative Count"
  ) +
  theme(
    #legend.position = "bottom",
    axis.title.x=element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor = element_blank(),
    text = element_text(family = "sans")
  )

plots$data.timeline
Figure 1: Cumulative counts of servers as their metadata appears in the quantitative data.
Code
library(dplyr)
library(arrow)
library(here)
library(ggplot2)
library(tidyr)

exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
  filter(redirect | short_lived)

rules_vectors <- arrow::read_parquet(
  here::here("data/rules/processed/vectorized_rules_en.parquet"),
  col_select = c("server", "text", "vector", "cluster", "representative")
) |>
  filter(!(server %in% exclude_servers$server)) |>
  dplyr::as_tibble()

hdbscan <- arrow::read_parquet(here::here("data/rules/processed/clusters_hdbscan.parquet"))

rules_vectors_has_clusters <- rules_vectors |>
  tidyr::drop_na(cluster) |>
  left_join(y = hdbscan, by = "cluster")

# First, convert the list column to a matrix
# Each row will be one vector
vectors_matrix <- do.call(rbind, rules_vectors_has_clusters$vector)

# Perform PCA
pca_result <- prcomp(vectors_matrix, center = TRUE, scale. = TRUE, rank = 2)

# Create dataframe with PCA results and clustering
pca_2d <- data.frame(
  PC1 = pca_result$x[,1],
  PC2 = pca_result$x[,2],
  cluster = rules_vectors_has_clusters$cluster,
  text = rules_vectors_has_clusters$text,
  label = rules_vectors_has_clusters$label
)

plots$pca.clusters <- ggplot(
  (pca_2d |>  filter(cluster >= 0) |> filter(cluster < 10) |> mutate(cluster = as.factor(cluster))),
    aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size=0.1) +
  theme_minimal() +
  labs(title = "PCA of Vector Data") +
  theme(legend.position="none")

plots$pca.clusters
Figure 2
Code
library(arrow)
library(dplyr)
library(here)
library(tinytable)
library(stringr)
library(tidyr)

# TODO: we are missing string "No racism, sexism, homophobia, transphobia, xenophobia, or casteism": solution
# TODO: Handling of languages / multilingual

exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
  filter(redirect | short_lived)

rules_vectors <- arrow::read_parquet(
  here::here("data/rules/processed/vectorized_rules_en.parquet"),
  col_select = c("server", "text", "cluster", "representative")
) |>
  filter(!(server %in% exclude_servers$server)) |>
  dplyr::as_tibble()

rule_clusters <- rules_vectors |> tidyr::drop_na(cluster)

most_common_rules <- rule_clusters |>
  count(text) |>
  arrange(desc(n))

top_rules <- inner_join(
  most_common_rules,
  (rules_vectors |> filter(cluster >= 0) |> distinct(text, cluster)),
  by = "text"
) |> arrange(cluster, desc(n)) |>
  distinct(cluster, .keep_all = TRUE) |>
  select(text, cluster)

plots$top.rules <- inner_join(
  top_rules,
  (rule_clusters |> count(cluster) |> rename(Count = n)),
  by = "cluster") |>
    select(text, Count) |>
    arrange(desc(Count)) |>
    head(10) |>
    rename(Rule = text) |>
    tinytable::tt(width = c(0.75, 0.25)) |>
    tinytable::style_tt(align = "lr")

plots$top.rules
Table 1: Most common rules on Fediverse servers.
Rule Count
No racism, sexism, homophobia, transphobia, xenophobia, or casteism 1925
No incitement of violence or promotion of violent ideologies 1629
No harassment, dogpiling or doxxing of other users 1581
No illegal content. 1273
Sexually explicit or violent media must be marked as sensitive when posting 1220
Do not share intentionally false or misleading information 1074
Be nice. 510
No spam or advertising. 410
Don't be a dick. 405
Be excellent to each other. 264
Code
saveRDS(plots, here::here("data/processed/rules_plots.rds"))
Source Code
---
title: Rules Study Notebook
---


```{r}
#| label: setup
#| include: false

# load libraries for entire document (not cached)
library(here)
library(arrow)

# load data
server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet"))
date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet"))

plots <- NULL
```

```{r}
#| label: fig-data-timeline
#| fig-cap: "Cumulative counts of servers as their metadata appears in the quantitative data."
#| fig-asp: 0.3
library(here)
library(ggplot2)
library(arrow)
library(dplyr)
library(viridis)

server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet"))
date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet"))

# Process first appearances
server_appearance_dates <- server_first_last |>
  rename(date = first_date) |>
  count(date)
server_app_counts <- server_appearance_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Process last appearances
server_last_dates <- server_first_last |>
  rename(date = last_date) |>
  count(date)
server_last_counts <- server_last_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Process first rule appearances (excluding NA values)
server_rule_dates <- server_first_last |>
  filter(!is.na(first_rule)) |>
  rename(date = first_rule) |>
  count(date)
server_rule_counts <- server_rule_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Create the plot
plots$data.timeline <- ggplot() +
  # Add first appearances area
  geom_area(
    data = server_app_counts,
    aes(x = date, y = count, fill = "Server First Seen"),
    #alpha = 0.6
  ) +
  # Add last appearances area
  geom_area(
    data = server_last_counts,
    aes(x = date, y = count, fill = "Server Last Seen")
  ) +
  # Add first rule appearances area
  geom_area(
    data = server_rule_counts,
    aes(x = date, y = count, fill = "Rules Seen")
  ) +
  # Use viridis color palette
  scale_fill_viridis(
    discrete = TRUE,
    option = "G",
    name = "Server Timeline",
    alpha = 0.5,
    breaks = c("Server First Seen", "Server Last Seen", "Rules Seen")
  ) +
  # Add rug plot for dates without crawls
  geom_rug(
    data = subset(date_has_data, !crawled),
    aes(x = date),
    sides = "b",
    length = unit(0.03, "npc"),
    color = "#000000",
    alpha = 0.5
  ) +
  theme_minimal() +
  labs(
    #title = "Server Timeline: First Appearance, Last Seen, and Rules Added",
    #x = "Date",
    y = "Cumulative Count"
  ) +
  theme(
    #legend.position = "bottom",
    axis.title.x=element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor = element_blank(),
    text = element_text(family = "sans")
  )

plots$data.timeline
```


```{r}
#| label: fig-rules-pca
library(dplyr)
library(arrow)
library(here)
library(ggplot2)
library(tidyr)

exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
  filter(redirect | short_lived)

rules_vectors <- arrow::read_parquet(
  here::here("data/rules/processed/vectorized_rules_en.parquet"),
  col_select = c("server", "text", "vector", "cluster", "representative")
) |>
  filter(!(server %in% exclude_servers$server)) |>
  dplyr::as_tibble()

hdbscan <- arrow::read_parquet(here::here("data/rules/processed/clusters_hdbscan.parquet"))

rules_vectors_has_clusters <- rules_vectors |>
  tidyr::drop_na(cluster) |>
  left_join(y = hdbscan, by = "cluster")

# First, convert the list column to a matrix
# Each row will be one vector
vectors_matrix <- do.call(rbind, rules_vectors_has_clusters$vector)

# Perform PCA
pca_result <- prcomp(vectors_matrix, center = TRUE, scale. = TRUE, rank = 2)

# Create dataframe with PCA results and clustering
pca_2d <- data.frame(
  PC1 = pca_result$x[,1],
  PC2 = pca_result$x[,2],
  cluster = rules_vectors_has_clusters$cluster,
  text = rules_vectors_has_clusters$text,
  label = rules_vectors_has_clusters$label
)

plots$pca.clusters <- ggplot(
  (pca_2d |>  filter(cluster >= 0) |> filter(cluster < 10) |> mutate(cluster = as.factor(cluster))),
    aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size=0.1) +
  theme_minimal() +
  labs(title = "PCA of Vector Data") +
  theme(legend.position="none")

plots$pca.clusters
```


```{r}
#| label: tbl-top-rules
#| tbl-cap: Most common rules on Fediverse servers.
#| cache: false # Cannot cache this due to tinytable/quarto limitations

library(arrow)
library(dplyr)
library(here)
library(tinytable)
library(stringr)
library(tidyr)

# TODO: we are missing string "No racism, sexism, homophobia, transphobia, xenophobia, or casteism": solution
# TODO: Handling of languages / multilingual

exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
  filter(redirect | short_lived)

rules_vectors <- arrow::read_parquet(
  here::here("data/rules/processed/vectorized_rules_en.parquet"),
  col_select = c("server", "text", "cluster", "representative")
) |>
  filter(!(server %in% exclude_servers$server)) |>
  dplyr::as_tibble()

rule_clusters <- rules_vectors |> tidyr::drop_na(cluster)

most_common_rules <- rule_clusters |>
  count(text) |>
  arrange(desc(n))

top_rules <- inner_join(
  most_common_rules,
  (rules_vectors |> filter(cluster >= 0) |> distinct(text, cluster)),
  by = "text"
) |> arrange(cluster, desc(n)) |>
  distinct(cluster, .keep_all = TRUE) |>
  select(text, cluster)

plots$top.rules <- inner_join(
  top_rules,
  (rule_clusters |> count(cluster) |> rename(Count = n)),
  by = "cluster") |>
    select(text, Count) |>
    arrange(desc(Count)) |>
    head(10) |>
    rename(Rule = text) |>
    tinytable::tt(width = c(0.75, 0.25)) |>
    tinytable::style_tt(align = "lr")

plots$top.rules
```

```{r}
#| label: save-data

saveRDS(plots, here::here("data/processed/rules_plots.rds"))
```