This analysis is similar to my previous one, which looked into GitHub use in Africa. I used GitHub Innovation Graph data to see how activity in public GitHub repositories is distributed around the world.
Code
# Setup ----
# Packages, GitHub Innovation Graph data and the helper functions used by the
# rest of the analysis.
library(tidyverse)
library(data.table)
library(janitor)
library(tmap)
library(patchwork)
library(sf)
library(ggiraph)
library(viridis) # For scale_fill_viridis_c
library(ggplot2)
library(stringr) # For str_wrap

# Data ----
# NOTE(review): Namibia's ISO alpha-2 code is the literal string "NA"; fread()
# may read it as a missing value with its default `na.strings` -- confirm, and
# pass `na.strings = ""` to the fread() calls below if that is the case.
developers <- fread(
  "https://raw.githubusercontent.com/github/innovationgraph/main/data/developers.csv"
)
git_pushes <- fread(
  "https://raw.githubusercontent.com/github/innovationgraph/main/data/git_pushes.csv"
)
git_repos <- fread(
  "https://raw.githubusercontent.com/github/innovationgraph/main/data/repositories.csv"
)
programming_languages <- fread(
  "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
)

# Columns shared by every Innovation Graph table used here.
merge_by_devs <- c("iso2_code", "year", "quarter")

# Join the developers table onto another table.
#
# devs:    table of developer counts.
# df:      table to enrich.
# by.vars: character vector of join columns (defaults to `merge_by_devs`).
#
# Returns the result of merge(devs, df, by = by.vars), i.e. only rows whose
# key is present in both tables.
merge_fun_devs <- function(devs, df, by.vars = merge_by_devs) {
  merge(devs, df, by = by.vars)
}

git_pushes <- merge_fun_devs(developers, git_pushes)
git_repos <- merge_fun_devs(developers, git_repos)
programming_languages <- merge_fun_devs(developers, programming_languages)

github_data_name <- c(
  "developers", "git_pushes", "git_repos", "programming_languages"
)

# Lookup from ISO alpha-2 codes (used by the GitHub data) to ISO alpha-3 codes
# (used by tmap's `World`).
iso3 <- fread(
  "https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv"
)
iso3 <- iso3[, .(iso2_code = `alpha-2`, iso_a3 = `alpha-3`)]

data("World") # from tmap

# The Africa-only filter is commented out, so `africa` holds every country in
# `World`. The name is kept because later chunks refer to it.
africa <- World %>%
  left_join(iso3, by = "iso_a3") # %>%
# filter(continent == "Africa")
setDT(africa)

# Helpers ----

# Find the most recent year, and the most recent quarter within that year.
#
# df:             data.frame, tibble or data.table.
# year_column:    name (string) of the year column.
# quarter_column: name (string) of the quarter column.
# concactenate:   if TRUE return the string "<year>-<quarter>", otherwise a
#                 list with elements `latest_year` and `latest_quarter`.
#                 (The argument name keeps its original spelling so existing
#                 calls continue to work.)
get_latest_year_and_quarter <- function(df, year_column, quarter_column,
                                        concactenate = TRUE) {
  years <- df[[year_column]]
  latest_year <- max(years, na.rm = TRUE)

  # Subset the quarter vector itself rather than the table: `df[lgl]` filters
  # rows of a data.table but selects *columns* of a plain data.frame.
  in_latest_year <- !is.na(years) & years == latest_year
  latest_quarter <- max(df[[quarter_column]][in_latest_year], na.rm = TRUE)

  if (concactenate) {
    return(paste0(latest_year, "-", latest_quarter))
  }
  list(latest_year = latest_year, latest_quarter = latest_quarter)
}

# Keep the latest year/quarter of `df`, join it onto `spatial_df`, compute a
# rate column and return the result as an sf object.
#
# df:               table with one row per country and period (and per group
#                   when `group_col` is given).
# spatial_df:       table holding the geometries. Every one of its rows is
#                   kept in the output (`all.y = TRUE`); when `group_col` is
#                   given this is done once per group.
# iso2_column_name: join column present in both tables.
# year_column, quarter_column: period columns of `df`.
# pop_est_column:   denominator column of the rate.
# freq_column:      numerator column of the rate; missing values become 0.
# geometry_column:  name of the geometry column in `spatial_df`.
# group_col:        optional grouping column of `df`; rows with a missing
#                   group are dropped.
# per, round_digits: the rate is
#                   round(freq / pop_est * per, round_digits).
#
# The rate is always stored in a column named `devs_per_100k`, whatever the
# value of `per`; missing rates are set to 0.
process_df_to_sf <- function(df,
                             spatial_df,
                             iso2_column_name = "iso2_code",
                             year_column = "year",
                             quarter_column = "quarter",
                             pop_est_column = "pop_est",
                             freq_column = "developers",
                             geometry_column = "geometry",
                             group_col = NULL,
                             per = 100000,
                             round_digits = 0) {
  # Fail before doing any work, and check the column the caller actually named
  # instead of a hard-coded "geometry".
  if (!geometry_column %in% colnames(spatial_df)) {
    stop("The spatial data frame must have a geometry column.", call. = FALSE)
  }

  # Work on a data.table without converting the caller's object in place.
  if (!is.data.table(df)) {
    df <- as.data.table(df)
  }

  latest_year_quarter <- get_latest_year_and_quarter(
    df, year_column, quarter_column,
    concactenate = FALSE
  )
  latest_year <- latest_year_quarter$latest_year
  latest_quarter <- latest_year_quarter$latest_quarter

  latest_df <- df[
    get(year_column) == latest_year & get(quarter_column) == latest_quarter
  ]

  if (!is.null(group_col)) {
    latest_df <- latest_df[!is.na(get(group_col)), ]
    dfs <- split.data.frame(latest_df, f = latest_df[[group_col]])

    # Join each group separately so that every group contains every row of
    # `spatial_df`; rows added by the join get the group's label.
    latest_df_joined <- rbindlist(lapply(dfs, function(x) {
      uniqu_grp_col <- unique(x[[group_col]])
      x <- merge(x, spatial_df, by = iso2_column_name, all.y = TRUE)
      x[, (group_col) := uniqu_grp_col]
      x
    }))
  } else {
    latest_df_joined <- merge(
      latest_df, spatial_df,
      by = iso2_column_name, all.y = TRUE
    )
  }

  latest_df_joined[is.na(get(freq_column)), (freq_column) := 0]
  latest_df_joined[
    ,
    devs_per_100k := round(
      .SD[[freq_column]] / .SD[[pop_est_column]] * per,
      digits = round_digits
    ),
    .SDcols = c(freq_column, pop_est_column)
  ]
  latest_df_joined[is.na(devs_per_100k), devs_per_100k := 0]

  st_set_geometry(latest_df_joined, value = geometry_column)
}

# Draw an interactive choropleth map.
#
# data_sf:       data passed to ggplot() and drawn with geom_sf_interactive().
# fill_var:      unquoted column used for the fill colour.
# tooltip_var:   unquoted column shown in the tooltip; the tooltip text is
#                "<tooltip_var>:<fill_var>".
# facet_var:     optional column name (string) to facet by.
# plot_title:    plot title.
# return_girafe: if TRUE return a girafe widget, otherwise the ggplot object.
# aspect_ratio:  passed to theme(aspect.ratio = ).
# point_size:    passed to girafe(pointsize = ).
#
# Example:
#   create_interactive_map(latest_developers_sf,
#                          fill_var = devs_per_100k, tooltip_var = name)
#   # add `facet_var = "region"` to facet by a column
create_interactive_map <- function(data_sf, fill_var, tooltip_var,
                                   facet_var = NULL,
                                   plot_title = "Developers per 100,000 people",
                                   return_girafe = TRUE,
                                   aspect_ratio = 1,
                                   point_size = 9) {
  p <- ggplot(
    data_sf,
    aes(
      fill = {{ fill_var }},
      tooltip = paste0({{ tooltip_var }}, ":", {{ fill_var }})
    )
  ) +
    geom_sf_interactive() +
    scale_fill_viridis_c() +
    theme_void() +
    theme(
      legend.position = "left",
      legend.key.width = unit(0.1, "cm"),
      plot.title = element_text(size = 11),
      aspect.ratio = aspect_ratio,
      plot.margin = unit(c(0, 0, 0, 0), "cm")
    ) +
    labs(title = plot_title, fill = "")

  # `facet_var` is a string, hence the .data pronoun.
  if (!is.null(facet_var)) {
    p <- p + facet_wrap(vars(.data[[facet_var]]))
  }

  if (return_girafe) {
    girafe(ggobj = p, pointsize = point_size)
  } else {
    p
  }
}

# Draw an interactive dodged bar chart and return it as a girafe widget.
#
# data:   data passed to ggplot().
# x_var, y_var, fill_var: unquoted columns for the x axis, bar height and
#         fill colour; the tooltip text is "<x_var>, <fill_var> <y_var>".
# title, xlab, ylab: plot title and axis labels.
plot_interactive_bar <- function(data, x_var, y_var, fill_var,
                                 title = "Top programming languages by country",
                                 xlab = "Country",
                                 ylab = "% Number of pushers") {
  top_countries_plot <- ggplot(
    data,
    aes(
      x = {{ x_var }}, y = {{ y_var }}, fill = {{ fill_var }},
      tooltip = paste0({{ x_var }}, ", ", {{ fill_var }}, " ", {{ y_var }})
    )
  ) +
    geom_bar_interactive(stat = "identity", position = "dodge", width = 0.5) +
    theme_minimal() +
    labs(title = title, fill = "", y = ylab, x = xlab) +
    # Wrap long axis labels so they do not overlap.
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
    scale_fill_brewer(type = "qual", palette = "Dark2") +
    theme(legend.position = "bottom")

  girafe(ggobj = top_countries_plot, pointsize = 9)
}

# Donut chart (a plotly pie chart with a hole).
#
# data:        data passed to plot_ly().
# labelColumn: name (string) of the column holding the slice labels.
# valueColumn: name (string) of the column holding the slice values.
# chartTitle:  chart title.
#
# plotly functions are namespace-qualified so the helper does not depend on
# library(plotly) having been attached first (otherwise `layout` would resolve
# to graphics::layout).
donhunt_chart <- function(data, labelColumn, valueColumn, chartTitle) {
  hidden_axis <- list(
    showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE
  )
  plotly::plot_ly(
    data,
    labels = ~ get(labelColumn),
    values = ~ get(valueColumn)
  ) %>%
    plotly::add_pie(hole = 0.6) %>%
    plotly::layout(
      title = chartTitle,
      xaxis = hidden_axis,
      yaxis = hidden_axis
    )
}
Top 15 programming languages by the number of pushers
# Ten countries with the highest `devs_per_100k`, reused by the per-country
# language chart.
# NOTE(review): `latest_developers_sf` is not created in the visible part of
# this script -- confirm it is built before this chunk runs.
latest_developers_sf_top <- top_n(latest_developers_sf, 10, devs_per_100k)

# One row per country and language. With `pop_est_column = "developers"` and
# `per = 100`, the `devs_per_100k` column holds the percentage of a country's
# developers who pushed code in that language.
programming_languages_sf <- process_df_to_sf(
  programming_languages,
  spatial_df = africa,
  iso2_column_name = "iso2_code",
  freq_column = "num_pushers",
  pop_est_column = "developers",
  group_col = "language",
  per = 100,
  round_digits = 2
)

# Plain table for aggregation: drop the geometry column (previously misspelt
# as `geomety`, so it was never removed).
programming_languages_dt <- as.data.table(programming_languages_sf)
programming_languages_dt[, geometry := NULL]
programming_languages_dt[, name := as.character(name)]

## Top 15 programming languages by the total number of pushers.
## The object keeps the name `top_10` because later chunks refer to it.
## head() returns fewer rows, instead of NA padding, when fewer than 15
## languages are present.
top_10 <- programming_languages_dt[
  ,
  .(num_pushers = sum(num_pushers)),
  by = language
][order(-num_pushers), head(.SD, 15)]

library(plotly)

# Donut chart of the languages in `top_10`
top_lan <- donhunt_chart(
  data = top_10,
  labelColumn = "language",
  valueColumn = "num_pushers",
  chartTitle = "Top 15 Programming Languages by number of pushers in the World"
)
htmltools::div(top_lan, align = "center")
Top 3 programming languages
Top 3 programming languages by the number of pushers in each country
Only the top ten countries by number of developers per 100,000 people are shown.
# Restrict to the top countries and drop country/language pairs with a zero
# share.
number_pushers_lan <- programming_languages_dt[
  iso2_code %in% latest_developers_sf_top$iso2_code
]
number_pushers_lan <- number_pushers_lan[devs_per_100k != 0, ]

## Top 3 languages per country, ranked by `devs_per_100k` (here the percentage
## of the country's developers who pushed code in the language).
## head() keeps countries with fewer than three languages free of the all-NA
## rows that `.SD[1:3]` would add.
top3_lan_per_country <- number_pushers_lan[
  order(-devs_per_100k),
  head(.SD, 3),
  by = name
]

plot_interactive_bar(
  data = top3_lan_per_country,
  x_var = name,
  y_var = devs_per_100k,
  fill_var = language,
  title = "Top 3 languages with highest % of developers that pushed code to GitHub"
)
Top 6 programming languages in the world: distribution of the percentage of developers pushing code to GitHub
# Map the six most-pushed languages, one facet per language.
# `top_10` is sorted by descending pusher count (in place) so that its first
# six entries are the six most-used languages.
setorder(top_10, -num_pushers)

# head() avoids the NA entries that `[1:6]` would produce if fewer than six
# languages were available.
programming_languages_sf_top6 <- programming_languages_sf %>%
  filter(language %in% head(top_10$language, 6))

create_interactive_map(
  data_sf = programming_languages_sf_top6,
  fill_var = devs_per_100k,
  tooltip_var = name,
  facet_var = "language",
  plot_title = "Distribution of top 6 programming languages in World (% of developers that pushed code to GitHub)"
)
A look at R: Used for analytics and statistics
Analytics/Data Science
Python is shown on the previous map.
# Languages of interest: "R", "Python", "Java"

# Keep only the rows of `sf_obj` whose `language` is one of `languages`.
#
# sf_obj:    table with a `language` column.
# languages: character vector of language names to keep.
filter_sf_obj <- function(sf_obj, languages) {
  filter(sf_obj, language %in% languages)
}

# Rows for R only
r_pyh <- programming_languages_sf %>%
  filter_sf_obj(languages = "R")

# Map of the number of R pushers per country
r_pyh %>%
  create_interactive_map(
    fill_var = num_pushers,
    tooltip_var = name,
    # facet_var = "language",
    plot_title = "Distribution of R users in World"
  )
Number of pushers: a look at Java & Swift, languages used as a proxy for mobile app development
# Languages of interest: "R", "Python", "Java"

# Rows for Java and Swift only
java_swift <- programming_languages_sf %>%
  filter_sf_obj(languages = c("Java", "Swift"))

# Number of pushers per country, one facet per language
java_swift %>%
  create_interactive_map(
    fill_var = num_pushers,
    tooltip_var = name,
    facet_var = "language",
    plot_title = "Countries leading in Java or Swift"
  )