Rules Study Notebook

Code

library(here)
library(ggplot2)
library(arrow)
library(dplyr)
library(viridis)

server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet"))
date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet"))

# Process first appearances
server_appearance_dates <- server_first_last |>
  rename(date = first_date) |>
  count(date)
server_app_counts <- server_appearance_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Process last appearances
server_last_dates <- server_first_last |>
  rename(date = last_date) |>
  count(date)
server_last_counts <- server_last_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Process first rule appearances (excluding NA values)
server_rule_dates <- server_first_last |>
  filter(!is.na(first_rule)) |>
  rename(date = first_rule) |>
  count(date)
server_rule_counts <- server_rule_dates |> 
  arrange(date) |> 
  mutate(count = cumsum(n))

# Create the plot
plots$data.timeline <- ggplot() +
  # Add first appearances area
  geom_area(
    data = server_app_counts,
    aes(x = date, y = count, fill = "Server First Seen"),
    #alpha = 0.6
  ) +
  # Add last appearances area
  geom_area(
    data = server_last_counts,
    aes(x = date, y = count, fill = "Server Last Seen")
  ) +
  # Add first rule appearances area
  geom_area(
    data = server_rule_counts,
    aes(x = date, y = count, fill = "Rules Seen")
  ) +
  # Use viridis color palette
  scale_fill_viridis(
    discrete = TRUE,
    option = "G",
    name = "Server Timeline",
    alpha = 0.5,
    breaks = c("Server First Seen", "Server Last Seen", "Rules Seen")
  ) +
  # Add rug plot for dates without crawls
  geom_rug(
    data = subset(date_has_data, !crawled),
    aes(x = date),
    sides = "b",
    length = unit(0.03, "npc"),
    color = "#000000",
    alpha = 0.5
  ) +
  theme_minimal() +
  labs(
    #title = "Server Timeline: First Appearance, Last Seen, and Rules Added",
    #x = "Date",
    y = "Cumulative Count"
  ) +
  theme(
    #legend.position = "bottom",
    axis.title.x=element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor = element_blank(),
    text = element_text(family = "sans")
  )

plots$data.timeline

Figure 1: Cumulative counts of servers as their metadata appears in the quantitative data.

Code

library(dplyr)
library(arrow)
library(here)
library(ggplot2)
library(tidyr)

exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
  filter(redirect | short_lived)

rules_vectors <- arrow::read_parquet(
  here::here("data/rules/processed/vectorized_rules_en.parquet"),
  col_select = c("server", "text", "vector", "cluster", "representative")
) |>
  filter(!(server %in% exclude_servers$server)) |>
  dplyr::as_tibble()

hdbscan <- arrow::read_parquet(here::here("data/rules/processed/clusters_hdbscan.parquet"))

rules_vectors_has_clusters <- rules_vectors |>
  tidyr::drop_na(cluster) |>
  left_join(y = hdbscan, by = "cluster")

# First, convert the list column to a matrix
# Each row will be one vector
vectors_matrix <- do.call(rbind, rules_vectors_has_clusters$vector)

# Perform PCA
pca_result <- prcomp(vectors_matrix, center = TRUE, scale. = TRUE, rank = 2)

# Create dataframe with PCA results and clustering
pca_2d <- data.frame(
  PC1 = pca_result$x[,1],
  PC2 = pca_result$x[,2],
  cluster = rules_vectors_has_clusters$cluster,
  text = rules_vectors_has_clusters$text,
  label = rules_vectors_has_clusters$label
)

plots$pca.clusters <- ggplot(
  (pca_2d |>  filter(cluster >= 0) |> filter(cluster < 10) |> mutate(cluster = as.factor(cluster))),
    aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size=0.1) +
  theme_minimal() +
  labs(title = "PCA of Vector Data") +
  theme(legend.position="none")

plots$pca.clusters

Code

library(arrow)
library(dplyr)
library(here)
library(tinytable)
library(stringr)
library(tidyr)

# TODO: we are missing string "No racism, sexism, homophobia, transphobia, xenophobia, or casteism": solution
# TODO: Handling of languages / multilingual

exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |>
  filter(redirect | short_lived)

rules_vectors <- arrow::read_parquet(
  here::here("data/rules/processed/vectorized_rules_en.parquet"),
  col_select = c("server", "text", "cluster", "representative")
) |>
  filter(!(server %in% exclude_servers$server)) |>
  dplyr::as_tibble()

rule_clusters <- rules_vectors |> tidyr::drop_na(cluster)

most_common_rules <- rule_clusters |>
  count(text) |>
  arrange(desc(n))

top_rules <- inner_join(
  most_common_rules,
  (rules_vectors |> filter(cluster >= 0) |> distinct(text, cluster)),
  by = "text"
) |> arrange(cluster, desc(n)) |>
  distinct(cluster, .keep_all = TRUE) |>
  select(text, cluster)

plots$top.rules <- inner_join(
  top_rules,
  (rule_clusters |> count(cluster) |> rename(Count = n)),
  by = "cluster") |>
    select(text, Count) |>
    arrange(desc(Count)) |>
    head(10) |>
    rename(Rule = text) |>
    tinytable::tt(width = c(0.75, 0.25)) |>
    tinytable::style_tt(align = "lr")

plots$top.rules

Table 1: Most common rules on Fediverse servers.

Rule	Count
No racism, sexism, homophobia, transphobia, xenophobia, or casteism	1925
No incitement of violence or promotion of violent ideologies	1629
No harassment, dogpiling or doxxing of other users	1581
No illegal content.	1273
Sexually explicit or violent media must be marked as sensitive when posting	1220
Do not share intentionally false or misleading information	1074
Be nice.	510
No spam or advertising.	410
Don't be a dick.	405
Be excellent to each other.	264

Code

saveRDS(plots, here::here("data/processed/rules_plots.rds"))

--- title: Rules Study Notebook --- ```{r} #| label: setup #| include: false # load libraries for entire document (not cached) library(here) library(arrow) # load data server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet")) date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet")) plots <- NULL ``` ```{r} #| label: fig-data-timeline #| fig-cap: "Cumulative counts of servers as their metadata appears in the quantitative data." #| fig-asp: 0.3 library(here) library(ggplot2) library(arrow) library(dplyr) library(viridis) server_first_last <- arrow::read_parquet(here("data/rules/plots/server_first_last.parquet")) date_has_data <- arrow::read_parquet(here("data/rules/plots/date_has_data.parquet")) # Process first appearances server_appearance_dates <- server_first_last |> rename(date = first_date) |> count(date) server_app_counts <- server_appearance_dates |> arrange(date) |> mutate(count = cumsum(n)) # Process last appearances server_last_dates <- server_first_last |> rename(date = last_date) |> count(date) server_last_counts <- server_last_dates |> arrange(date) |> mutate(count = cumsum(n)) # Process first rule appearances (excluding NA values) server_rule_dates <- server_first_last |> filter(!is.na(first_rule)) |> rename(date = first_rule) |> count(date) server_rule_counts <- server_rule_dates |> arrange(date) |> mutate(count = cumsum(n)) # Create the plot plots$data.timeline <- ggplot() + # Add first appearances area geom_area( data = server_app_counts, aes(x = date, y = count, fill = "Server First Seen"), #alpha = 0.6 ) + # Add last appearances area geom_area( data = server_last_counts, aes(x = date, y = count, fill = "Server Last Seen") ) + # Add first rule appearances area geom_area( data = server_rule_counts, aes(x = date, y = count, fill = "Rules Seen") ) + # Use viridis color palette scale_fill_viridis( discrete = TRUE, option = "G", name = "Server Timeline", alpha = 0.5, breaks = c("Server First Seen", "Server Last Seen", "Rules Seen") ) + # Add rug plot for dates without crawls geom_rug( data = subset(date_has_data, !crawled), aes(x = date), sides = "b", length = unit(0.03, "npc"), color = "#000000", alpha = 0.5 ) + theme_minimal() + labs( #title = "Server Timeline: First Appearance, Last Seen, and Rules Added", #x = "Date", y = "Cumulative Count" ) + theme( #legend.position = "bottom", axis.title.x=element_blank(), plot.title = element_text(hjust = 0.5), panel.grid.minor = element_blank(), text = element_text(family = "sans") ) plots$data.timeline ``` ```{r} #| label: fig-rules-pca library(dplyr) library(arrow) library(here) library(ggplot2) library(tidyr) exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |> filter(redirect | short_lived) rules_vectors <- arrow::read_parquet( here::here("data/rules/processed/vectorized_rules_en.parquet"), col_select = c("server", "text", "vector", "cluster", "representative") ) |> filter(!(server %in% exclude_servers$server)) |> dplyr::as_tibble() hdbscan <- arrow::read_parquet(here::here("data/rules/processed/clusters_hdbscan.parquet")) rules_vectors_has_clusters <- rules_vectors |> tidyr::drop_na(cluster) |> left_join(y = hdbscan, by = "cluster") # First, convert the list column to a matrix # Each row will be one vector vectors_matrix <- do.call(rbind, rules_vectors_has_clusters$vector) # Perform PCA pca_result <- prcomp(vectors_matrix, center = TRUE, scale. = TRUE, rank = 2) # Create dataframe with PCA results and clustering pca_2d <- data.frame( PC1 = pca_result$x[,1], PC2 = pca_result$x[,2], cluster = rules_vectors_has_clusters$cluster, text = rules_vectors_has_clusters$text, label = rules_vectors_has_clusters$label ) plots$pca.clusters <- ggplot( (pca_2d |> filter(cluster >= 0) |> filter(cluster < 10) |> mutate(cluster = as.factor(cluster))), aes(x = PC1, y = PC2, color = cluster)) + geom_point(alpha = 0.5, size=0.1) + theme_minimal() + labs(title = "PCA of Vector Data") + theme(legend.position="none") plots$pca.clusters ``` ```{r} #| label: tbl-top-rules #| tbl-cap: Most common rules on Fediverse servers. #| cache: false # Cannot cache this due to tinytable/quarto limitations library(arrow) library(dplyr) library(here) library(tinytable) library(stringr) library(tidyr) # TODO: we are missing string "No racism, sexism, homophobia, transphobia, xenophobia, or casteism": solution # TODO: Handling of languages / multilingual exclude_servers <- arrow::read_parquet(here::here("data/rules/processed/server_selections.parquet")) |> filter(redirect | short_lived) rules_vectors <- arrow::read_parquet( here::here("data/rules/processed/vectorized_rules_en.parquet"), col_select = c("server", "text", "cluster", "representative") ) |> filter(!(server %in% exclude_servers$server)) |> dplyr::as_tibble() rule_clusters <- rules_vectors |> tidyr::drop_na(cluster) most_common_rules <- rule_clusters |> count(text) |> arrange(desc(n)) top_rules <- inner_join( most_common_rules, (rules_vectors |> filter(cluster >= 0) |> distinct(text, cluster)), by = "text" ) |> arrange(cluster, desc(n)) |> distinct(cluster, .keep_all = TRUE) |> select(text, cluster) plots$top.rules <- inner_join( top_rules, (rule_clusters |> count(cluster) |> rename(Count = n)), by = "cluster") |> select(text, Count) |> arrange(desc(Count)) |> head(10) |> rename(Rule = text) |> tinytable::tt(width = c(0.75, 0.25)) |> tinytable::style_tt(align = "lr") plots$top.rules ``` ```{r} #| label: save-data saveRDS(plots, here::here("data/processed/rules_plots.rds")) ```