library("dplyr")
library("Seurat")
library("knitr")
library("ggplot2")
library("BiocManager")
library("here")
#BiocManager::install("EnhancedVolcano")
library("EnhancedVolcano") #volcano plot
#install.packages('DESeq2') #for DEG
library("DESeq2")
library("tidyverse") #tidy up data
library("styler") #tidy up data


if (!require("kableExtra")) {install.packages("kableExtra"); require("kableExtra")} # for color brewer
if (!require("RColorBrewer")) {install.packages("RColorBrewer"); require("RColorBrewer")} # for color brewer
if (!require("sctransform")) {install.packages("sctransform"); require("sctransform")} # for data normalization
if (!require("glmGamPoi")) {BiocManager::install('glmGamPoi'); require("glmGamPoi")} # for data normalization, sctransform
if (!require("cowplot")) {install.packages("cowplot"); require("cowplot")} # for figure layout
if (!require("patchwork")) {install.packages("patchwork"); require("patchwork")} # for figure patching
if (!require("openxlsx")) {install.packages("openxlsx"); require("openxlsx")} # to save .xlsx files

# install.packages("styler")



set.seed(12345)
# here()

1 Welcome

Welcome to the Single-Cell Omics Research and Education Club!

If this is your time to the club, I want to extend and extra-special welcome to you!

I’m Jonathan Nelson, an Assistant Professor at the University of Southern California. I’m a wet scientist turned wet+dry scientist. I’ve been working with single-cell RNAseq data for the past 5 years and I’m excited to share what I’ve learned with you.

1.1 SCORE Values

1.1.1 Learning

We believe that bioinformatics is a constantly evolving field, and that ongoing learning and professional development is essential to staying up-to-date. We encourage members to share their knowledge and experiences with each other, and to seek out opportunities for continued learning.

1.1.2 Accessibility

We believe that access to bioinformatics support should be available to everyone. We strive to create a welcoming and inclusive environment where all members can feel comfortable asking for help and contributing to the group.

1.1.3 Collaboration

We believe that working together is key to achieving success in bioinformatics. We value the diversity of perspectives and backgrounds that each member brings, and we encourage open communication and the sharing of ideas.

1.1.4 Integrity

We believe in conducting ourselves with honesty and professionalism in all our interactions. We hold ourselves to high ethical standards and respect the privacy and confidentiality of all members.

1.1.5 Empathy

We believe in approaching each other with empathy and kindness. We understand that bioinformatics can be a challenging and sometimes frustrating field, and we strive to support each other through these difficulties.

1.2 Context and Expectations

I know a lot of this has been going on in the background for everyeone and I wanted to bring it to the forefront. My expectation is that we have about 6 of these meetings together and then we can re-evaluate if we want to continue as a group or not.

Email me you would like me to add anyone:

Today’s code (this html file) will be posted to the SCORE website (https://usckrc.github.io/website/score.html)

2 The Agenda!

2.1 Music and Memes

2.2 Coding Crumbs: Gene Conversion

2.3 Recreating a Figure: Sankey Plot

2.4 Main Theme: CellChat




3 Music and Memes

3.1 This Months Coding Music

The Antlers: Green to Gold

https://open.spotify.com/album/72XWeRa3UZSkxLkGs2kTHo?si=w8GI4_W4QlyeBYmtA6kiUg

3.2 The Memes!











4 Coding Crumbs: Gene Conversion

4.1 Changing between Mouse and Human gene names is common maneuver

4.2 Why?

4.2.1 Translational Research

4.2.2 Target Identification for Therapies

4.2.3 Pathway and Functional Annotation

4.3 It’s a Challenge

4.3.1 Gene Duplications

4.3.2 Unique Names

4.4 Approaches I’ve used before

4.4.1 Packages

4.4.1.1 geneName

https://github.com/mustafapir/geneName

4.4.1.1.1 Install
if (!require("geneName")) {install.packages("geneName"); require("geneName")}

4.4.1.2 Usage

df_human <- mousegnameConverter(df_mouse, "gene")

4.4.2 ChatGPT

4.4.2.1 Use at your own risk.

4.4.2.1.1 Quick…but unreliable

4.4.3 Functions

https://www.biostars.org/p/9567892/

4.4.3.1 Human -> Mouse

convert_human_to_mouse <- function(gene_list) {
    output = c()
    mouse_human_genes = read.csv("https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt",sep="\t")

    for(gene in gene_list) {
          class_key = (mouse_human_genes %>% filter(Symbol == gene & Common.Organism.Name == "human"))[['DB.Class.Key']]
          if( !identical(class_key, integer(0)) ) {
            human_genes = (mouse_human_genes %>% filter(DB.Class.Key == class_key & Common.Organism.Name=="mouse, laboratory"))[,"Symbol"]
            for(human_gene in human_genes) {
                output = rbind(c(gene, human_gene), output)
            }
          }
     }
     return (output)
}

df_mouse_gene <- convert_human_to_mouse(gene_list)

df_mouse_gene <- as.data.frame(df_mouse_gene)

4.4.3.2 Mouse -> Human

convert_mouse_to_human <- function(gene_list) { 
     output = c()
     mouse_human_genes = read.csv("https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt",sep="\t")

     for(gene in gene_list) {
          class_key = (mouse_human_genes %>% filter(Symbol == gene & Common.Organism.Name == "mouse, laboratory"))[['DB.Class.Key']]
          if( !identical(class_key, integer(0)) ) {
               human_genes = (mouse_human_genes %>% filter(DB.Class.Key == class_key & Common.Organism.Name=="human"))[,"Symbol"]
               for(human_gene in human_genes) {
                    output = rbind(c(gene, human_gene), output)
               }
          }
     }
     return (output)
}

df_human_gene <- convert_mouse_to_human(gene_list)

df_human_gene <- as.data.frame(df_human_gene)

4.5 Take Home

4.5.1 Function is what I use now.

4.5.1.1 I should probably download the database localy onto my computer

4.5.2 I suspect there are other options.

4.5.2.1 BioMart?

4.5.2.2 How do others in the chat do this?

5 Recreating a Figure: Sankey Plot

5.1 Final Product!

5.2 Inspiration

Source

Source

Source

Source

5.3 History

Sankey Wiki

5.4 Lessons for Me

5.4.1 Layering a graph with different dataframes

5.4.2 Leveling a graph with data

5.4.3 Setting a color palette

5.5 Starting Place

I wanted to start with easy .xlsx file that was flexible to fill out.

5.5.1 Research Input

5.5.2 Research Output

if (!require("here")) {install.packages("here"); require("here")}
if (!require("openxlsx")) {install.packages("openxlsx"); require("openxlsx")}
if (!require("plotly")) {install.packages("plotly"); require("plotly")}
library(tidyr)

#remotes::install_github("davidsjoberg/ggsankey")
library(ggsankey)

df <- read.xlsx(here("Week 4 CellChat", "data", "Revenue Input.xlsx"))

5.5.3 Create a dataframe with proportional entries

df_subcontract <- df %>%
  uncount(Subcontract.value, .remove = FALSE) %>%
  mutate(Subcontract_Number = Subcontracts) %>% 
  select(Subcontracts)


df_Grants <- df %>%
  uncount(Grants.value, .remove = FALSE) %>%
  mutate(Grants.Number = Grants) %>% 
  select(Grants)

df1 <- df_subcontract
df2 <- df_Grants

max_rows <- max(nrow(df1), nrow(df2))

df1_extended <- df1 %>% 
  bind_rows(tibble(Subcontracts = rep(NA, max_rows - nrow(df1))))

df2_extended <- df2 %>% 
  bind_rows(tibble(Grants = rep(NA_character_, max_rows - nrow(df2))))

df_merged <- cbind(df1_extended , df2_extended)

df_merged$Corporeal <- "JWN"

5.5.4 Convert into a Sankey format

df3 <- df_merged %>% make_long(Subcontracts, Grants, Corporeal) %>% 
  filter(!is.na(node))

5.6 Create Sankey plot

ggplot2 object > sankey layering > theme > plot

Interestingly, the ggplot2 object is blank if you try to plot it.

df3.input <- df3

p1 <- ggplot(df3, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE)

p1 <- p1 + theme_classic() + theme(legend.position = "none")


p1

5.6.1 I found interactive plotly format helpful for troubleshooting

Because categories are not labeled in chart.

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.7 Add Research Output: Level 0

df <- read.xlsx(here("Week 4 CellChat", "data", "Research Output.xlsx"))

df[is.na(df)] <- 0

df_Main <- df %>%
  uncount(Main.value, .remove = FALSE) %>%
  mutate(Main_Number = Main) %>% 
  select(Main)

df3 <- df_Main  
df3$Corporeal <- "JWN"

# Make Corporeal the first column using dplyr

df3 <- df3 %>% select(Corporeal, Main)

df3 <- df3 %>% make_long(Corporeal, Main) %>% 
  filter(!is.na(node))

df4 <- rbind(df3.input, df3)

df4 <- df4 %>% filter(!(x == "Corporeal" & is.na(next_x)))

p1 <- ggplot(df4, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE)


p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.8 Add Level 1

5.8.1 Research Details

research_value <- df %>%
  filter(Main == "Research") %>%
  pull(Main.value) %>% 
  `/`(100)

df_Research.Level1 <- df %>%
    uncount(Research.Level1.value * research_value, .remove = FALSE) %>%
  mutate(Level1 = Research.Level1) %>% 
  select(Level1)

df3 <- df_Research.Level1  
df3$Main <- "Research"

df3 <- df3 %>% select(Main, Level1)

df3 <- df3 %>% make_long(Main, Level1) %>% 
  filter(!is.na(node))

df4 <- rbind(df4, df3)

df5 <- df4 %>% filter(!(x == "Main" & node == "Research" & is.na(next_x)))

p1 <- ggplot(df5, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE)

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.8.2 Service Details

service_value <- df %>%
  filter(Main == "Service") %>%
  pull(Main.value) %>% 
  `/`(100)

df[is.na(df)] <- 0

df_Service.Level1 <- df %>%
  uncount(Service.Level1.value * service_value, .remove = FALSE) %>%
  mutate(Level1 = Service.Level1) %>% 
  select(Level1)

df3 <- df_Service.Level1  
df3$Main <- "Service"

df3 <- df3 %>% select(Main, Level1)

df3 <- df3 %>% make_long(Main, Level1) %>% 
  filter(!is.na(node))

df5 <- rbind(df5, df3)

df6 <- df5 %>% filter(!(x == "Main" & node == "Service" & is.na(next_x)))

p1 <- ggplot(df6, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE
                      ,type = "sankey")

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.8.3 Teaching Details

Teaching_value <- df %>%
  filter(Main == "Teaching and Mentorship") %>%
  pull(Main.value) %>% 
  `/`(100)

df[is.na(df)] <- 0

df_Teaching.Level1 <- df %>%
  uncount(Teaching.Level1.value * Teaching_value, .remove = FALSE) %>%
  mutate(Level1 = Teaching.Level1) %>% 
  select(Level1)

df3 <- df_Teaching.Level1  
df3$Main <- "Teaching and Mentorship"

df3 <- df3 %>% select(Main, Level1)

df3 <- df3 %>% make_long(Main, Level1) %>% 
  filter(!is.na(node))

df6 <- rbind(df6, df3)

df7 <- df6 %>% filter(!(x == "Main" & node == "Teaching and Mentorship" & is.na(next_x)))

p1 <- ggplot(df6, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE
                      ,type = "sankey")

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.9 Add Level 2

5.9.1 Collab

Collab_value <- df %>%
  filter(Research.Level1 == "Collaborative") %>%
  pull(Research.Level1.value) %>% 
  `/`(100)

df[is.na(df)] <- 0

df_Collab.Level2 <- df %>%
  uncount(Collab.Level2.value * Collab_value * research_value, .remove = FALSE) %>%
  mutate(Level2 = Collab.Level2) %>% 
  select(Level2)

df3 <- df_Collab.Level2  
df3$Level1 <- "Collaborative"

df3 <- df3 %>% select(Level1, Level2)

df3 <- df3 %>% make_long(Level1, Level2) %>% 
  filter(!is.na(node))

df8 <- rbind(df7, df3)

df9 <- df8 %>% filter(!(x == "Level1" & node == "Collaborative" & is.na(next_x)))

p1 <- ggplot(df9, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE
                      ,type = "sankey")

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.9.2 Primary Research

df9 <- df8 %>% filter(!(x == "Level1" & node == "Collaborative" & is.na(next_x)))

Primary_value <- df %>%
  filter(Research.Level1 == "Primary") %>%
  pull(Research.Level1.value) %>% 
  `/`(100)

df[is.na(df)] <- 0

df_Research.Level2 <- df %>%
  uncount(Research.Level2.value * Primary_value * research_value, .remove = FALSE) %>%
  mutate(Level2 = Research.Level2) %>% 
  select(Level2)

df3 <- df_Research.Level2  
df3$Level1 <- "Primary"

df3 <- df3 %>% select(Level1, Level2)

df3 <- df3 %>% make_long(Level1, Level2) %>% 
  filter(!is.na(node))

df10 <- rbind(df9, df3)

df11 <- df10 %>% filter(!(x == "Level1" & node == "Primary" & is.na(next_x)))

p1 <- ggplot(df11, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE
                      ,type = "sankey")

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.10 Aesthetics

5.10.1 Factor nodes in correct order

df.test <- df11 

df.test$node <- as.factor(df.test$node)

# levels(df.test$node)

levels_to_assign <- c("Emery R01",
                      "Gurley R01",
                      "Ellison R01",
                      
                      "K01",
                      "Startup",
                      "Subcontracts",
                     
                       "JWN",
                      
                      "Research",
                      "Service",
                      "Teaching and Mentorship",
                      
                      "Collaborative",
                      "Primary",
                      
                      "Committees",
                      "Peer Review",
                      "Journal Club",
                      "Teaching",
                      "Mentorship",
                      "Course Development",
                      
                      
                      "Chung",
                      "Duvoisin",
                      
                      "Ellison",
                      "Emery",
                      "Gurley",
                      "Hutchens",
                      "Srivastava",
                      "McDonough",
                      "Nakai",
                      
                      "Kidney-Specific Drugs",
                      "PFAS and Kidney Disease",
                      "Pdgfrb-DREADD",
                      "Pdgfrb-INTACT",
                      "Pdgfrb-AT1aR KO"
                      
                      )   

                      
df.test$node <- factor(df.test$node, levels = levels_to_assign)


p1 <- ggplot(df.test, aes(x = x
                     , next_x = next_x
                     , node = node
                     , next_node = next_node
                     , fill = factor(node)
                     , label = node)
             )

p1 <- p1 +geom_sankey(flow.alpha = 0.5
                      , node.color = "black"
                      ,show.legend = FALSE)


p1 <- p1 + theme_classic() + theme(legend.position = "none")

p1

p_plotly <- ggplotly(p1, tooltip = "text")

p_plotly

5.10.2 ggplot2 tests

p1 <- p1 + theme_classic() + theme(legend.position = "none") +
  geom_sankey_label(size = 3.5, color = 1, fill = "white")

# remove axis lines and axis labels

p1 <- p1 + theme(axis.line = element_blank(),
                 axis.text.x = element_blank(),
                 axis.text.y = element_blank(),
                 axis.ticks = element_blank(),
                 axis.title.x = element_blank(),
                 axis.title.y = element_blank())


p2 <- p1 + scale_fill_viridis_d(direction = -1, option = "viridis") + 
      scale_colour_viridis_d(direction = -1, option = "viridis")
p2

p3 <- p1 + scale_fill_viridis_d(direction = 1, option = "viridis") + 
      scale_colour_viridis_d(direction = 1, option = "viridis")
p3

p4 <- p1 + scale_fill_viridis_d(direction = -1, option = "turbo") + 
      scale_colour_viridis_d(direction = -1, option = "turbo")
p4

5.11 Final Sankey Plot

5.11.1 Adjust size of plot in YAML

5.11.2 Pick Color Pallette

5.11.3 Add Labels to for Research Revenue and Research Output

p2 <- p1 + scale_fill_viridis_d(direction = -1, option = "viridis") + 
      scale_colour_viridis_d(direction = -1, option = "viridis") +  
      annotate("text", x = 2, y = 200, label = "Research Revenue", color = "black", size = 7) +
      annotate("text", x = 4, y = 200, label = "Research Output", color = "black", size = 7) 

p2

6 Main Theme: CellChat

6.1 Switch to Kevin Burfeind






7 Session Survey

7.1 Community Questions

Do you have a coding problem that you’d like some support on?
Do you have a topic you’d like covered at a future meeting?

Email me:

7.2 Upcoming Schedule

8 Session Info

sessionInfo()
## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggsankey_0.0.99999          plotly_4.10.4              
##  [3] openxlsx_4.2.8              patchwork_1.3.0            
##  [5] cowplot_1.1.3               glmGamPoi_1.18.0           
##  [7] sctransform_0.4.1           RColorBrewer_1.1-3         
##  [9] kableExtra_1.4.0            styler_1.10.3              
## [11] lubridate_1.9.4             forcats_1.0.0              
## [13] stringr_1.5.1               purrr_1.0.4                
## [15] readr_2.1.5                 tidyr_1.3.1                
## [17] tibble_3.2.1                tidyverse_2.0.0            
## [19] DESeq2_1.46.0               SummarizedExperiment_1.36.0
## [21] Biobase_2.66.0              MatrixGenerics_1.18.1      
## [23] matrixStats_1.5.0           GenomicRanges_1.58.0       
## [25] GenomeInfoDb_1.42.3         IRanges_2.40.1             
## [27] S4Vectors_0.44.0            BiocGenerics_0.52.0        
## [29] EnhancedVolcano_1.24.0      ggrepel_0.9.6              
## [31] here_1.0.1                  BiocManager_1.30.25        
## [33] ggplot2_3.5.1               knitr_1.50                 
## [35] SeuratObject_5.0.2          Seurat_4.4.0               
## [37] dplyr_1.1.4                
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22        splines_4.4.3           later_1.4.1            
##   [4] R.oo_1.27.0             polyclip_1.10-7         lifecycle_1.0.4        
##   [7] rprojroot_2.0.4         globals_0.16.3          lattice_0.22-6         
##  [10] MASS_7.3-64             crosstalk_1.2.1         magrittr_2.0.3         
##  [13] sass_0.4.9              rmarkdown_2.29          jquerylib_0.1.4        
##  [16] yaml_2.3.10             httpuv_1.6.15           zip_2.3.2              
##  [19] spam_2.11-1             sp_2.2-0                spatstat.sparse_3.1-0  
##  [22] reticulate_1.41.0.1     pbapply_1.7-2           abind_1.4-8            
##  [25] zlibbioc_1.52.0         Rtsne_0.17              R.cache_0.16.0         
##  [28] R.utils_2.13.0          GenomeInfoDbData_1.2.13 irlba_2.3.5.1          
##  [31] listenv_0.9.1           spatstat.utils_3.1-3    goftest_1.2-3          
##  [34] spatstat.random_3.3-2   fitdistrplus_1.2-2      parallelly_1.42.0      
##  [37] svglite_2.1.3           leiden_0.4.3.1          codetools_0.2-20       
##  [40] DelayedArray_0.32.0     xml2_1.3.8              tidyselect_1.2.1       
##  [43] UCSC.utils_1.2.0        farver_2.1.2            spatstat.explore_3.3-4 
##  [46] jsonlite_1.9.1          progressr_0.15.1        ggridges_0.5.6         
##  [49] survival_3.8-3          systemfonts_1.2.1       tools_4.4.3            
##  [52] ica_1.0-3               Rcpp_1.0.14             glue_1.8.0             
##  [55] gridExtra_2.3           SparseArray_1.6.2       xfun_0.51              
##  [58] withr_3.0.2             fastmap_1.2.0           digest_0.6.37          
##  [61] timechange_0.3.0        R6_2.6.1                mime_0.12              
##  [64] colorspace_2.1-1        scattermore_1.2         tensor_1.5             
##  [67] spatstat.data_3.1-6     R.methodsS3_1.8.2       generics_0.1.3         
##  [70] data.table_1.17.0       httr_1.4.7              htmlwidgets_1.6.4      
##  [73] S4Arrays_1.6.0          uwot_0.2.3              pkgconfig_2.0.3        
##  [76] gtable_0.3.6            lmtest_0.9-40           XVector_0.46.0         
##  [79] htmltools_0.5.8.1       dotCall64_1.2           scales_1.3.0           
##  [82] png_0.1-8               spatstat.univar_3.1-2   rstudioapi_0.17.1      
##  [85] tzdb_0.5.0              reshape2_1.4.4          nlme_3.1-167           
##  [88] cachem_1.1.0            zoo_1.8-13              KernSmooth_2.23-26     
##  [91] parallel_4.4.3          miniUI_0.1.1.1          pillar_1.10.1          
##  [94] grid_4.4.3              vctrs_0.6.5             RANN_2.6.2             
##  [97] promises_1.3.2          xtable_1.8-4            cluster_2.1.8          
## [100] evaluate_1.0.3          cli_3.6.4               locfit_1.5-9.12        
## [103] compiler_4.4.3          rlang_1.1.5             crayon_1.5.3           
## [106] future.apply_1.11.3     labeling_0.4.3          plyr_1.8.9             
## [109] stringi_1.8.4           viridisLite_0.4.2       deldir_2.0-4           
## [112] BiocParallel_1.40.0     munsell_0.5.1           lazyeval_0.2.2         
## [115] spatstat.geom_3.3-5     Matrix_1.7-2            hms_1.1.3              
## [118] future_1.34.0           shiny_1.10.0            ROCR_1.0-11            
## [121] igraph_2.1.4            bslib_0.9.0

Identifying intercellular (cell-cell) interactions has remained elusive to many fields of biology. The emergence of single cell RNA sequencing technologies has presented us with a tremendous amount of data and a potentially infinite amount of receptor-ligand interactions.

There are several different packages/algorithms that attempt to infer intercellular interactions from sc/snRNASeq data. These packages were nicely review and combined in a Nature Communications paper

There are pros and cons to each fo the different algorithms. The most commonly used are CellPhoneDB and CellChat. I like CellChat because of its numerous visualization methods that are great for papers and presentations.

Today I’ll work through a single dataset using CellChat. I will go through some of CellChat’s diverse functionalities by applying it to a mouse single nuclear dataset.

CellChat requires gene expression data of cells as the user input and models the probability of cell-cell communication by integrating gene expression with prior knowledge of the interactions between signaling ligands, receptors and their cofactors.

Link to CellChat V1 paper

Link to CellChat V2 paper

Upon infering the intercellular communication network, CellChat provides functionality for further data exploration, analysis, and visualization.

Link to CellChat Github

Link to CellChat Vignette

Load the required libraries


library(BiocNeighbors)
library(CellChat)
library(patchwork)
library(presto)
library(here)
library(Seurat)
options(stringsAsFactors = FALSE)

Part I: Data input & processing and initialization of CellChat object

CellChat requires two user-defined inputs: one is the gene expression data of cells, and the other is either user assigned cell labels (i.e., label-based mode) or a low-dimensional representation of the single-cell data (i.e., label-free mode). In this case, we will use labeled kidney cells, from KGB and JWN’s publication in Physiological Genomics.

This is a mouse single-nucleus RNAseq dataset. Mice underwent cardiac arrest then resuscitation (a global ischemia model) or sham procedure.

Load data

For the gene expression data matrix, genes should be in rows with rownames and cells in columns with colnames. Normalized data (e.g., library-size normalization and then log-transformed with a pseudocount of 1) is required as input for CellChat analysis.

If the user provides count data, CellChat provides a normalizeData function to account for library size and then do log-transformed. For the cell group information, a dataframe with rownames is required as input for CellChat.

# We will start by loading the .rds file that Jonathan generated

CACPR_integrated <- readRDS(here("Week 4 CellChat", "data", "CACPR_integrated.rds"))

Create a CellChat object

USERS can create a new CellChat object from a data matrix, Seurat or SingleCellExperiment object. If input is a Seurat or SingleCellExperiment object, the meta data in the object will be used by default and USER must provide group.by to define the cell groups. e.g, group.by = “ident” for the default cell identities in Seurat object.

NB: If USERS load previously calculated CellChat object (version < 0.5.0), please update the object via updateCellChat

#Start by changing the assay to RNA if applicable
DefaultAssay(CACPR_integrated) <- "RNA"
cellchat <- createCellChat(object = CACPR_integrated,
                           group.by = "subclass.CACPR")
#> [1] "Create a CellChat object from a Seurat object"
#> The `data` slot in the default assay is used. The default assay is RNA 
#> The `meta.data` slot in the Seurat object is used as cell meta information
#> Warning in createCellChat(object = CACPR_integrated, group.by = "subclass.CACPR"): The 'meta' data does not have a column named `samples`. We now add this column and all cells are assumed to belong to `sample1`!
#> Set cell identities for the new CellChat object 
#> The cell groups used for CellChat analysis are  PTS1, PTS2, PTS3, PTinj, DTL, TAL1, TAL2, MD, DCT1, DCT2, CNT, PC, ICA, ICB, URO, PEC, MES, FIB, CONTRACT, PODO, EC, LYMPH, MACRO, LYMPHO

# set the database#
CellChatDB <- CellChatDB.mouse# use CellChatDB.mouse if running on mouse data
showDatabaseCategory(CellChatDB)


# subset the expression data of signaling genes for saving computation cost
cellchat@DB <- CellChatDB
cellchat <- subsetData(cellchat) # This step is necessary even if using the whole database
cellchat <- identifyOverExpressedGenes(cellchat)

Set the ligand-receptor interaction database

CellChatDB is a manually curated database of literature-supported ligand-receptor interactions in both human and mouse. CellChatDB in mouse contains 2,021 validated molecular interactions, including 60% of secrete autocrine/paracrine signaling interactions, 21% of extracellular matrix (ECM)-receptor interactions and 19% of cell-cell contact interactions. CellChatDB in human contains 1,939 validated molecular interactions, including 61.8% of paracrine/autocrine signaling interactions, 21.7% of extracellular matrix (ECM)-receptor interactions and 16.5% of cell-cell contact interactions.

Users can update CellChatDB by adding their own curated ligand-receptor pairs.There is a vignette to do that

CellChatDB <- CellChatDB.mouse # use CellChatDB.human if running on human data
#showDatabaseCategory(CellChatDB)
# You can look at the structure of the receptor-ligand pairs here
#df <- dplyr::glimpse(CellChatDB$interaction)

# We will use "CellChatDB", which includes all of the receptor-ligand pairs - the default CellChatDB for cell-cell communication analysis
cellchat@DB <- CellChatDB

# However, you can use a subset of CellChatDB for cell-cell communication analysis
# For example, you can use Secreted Signaling
#CellChatDB.use <- subsetDB(CellChatDB, search = "Secreted Signaling")
# Then set the used database in the object
#cellchat@DB <- CellChatDB.use

Preprocessing the expression data for cell-cell communication analysis

To infer the cell state-specific communications, CellChat identifies over-expressed ligands or receptors in one cell group and then identifies over-expressed ligand-receptor interactions if either ligand or receptor is over-expressed.

# subset the expression data of signaling genes for saving computation cost
cellchat <- subsetData(cellchat) # This step is necessary even if using the whole database
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- identifyOverExpressedInteractions(cellchat)
#> The number of highly variable ligand-receptor pairs used for signaling inference is 2831

# project gene expression data onto PPI (Optional: when running it, USER should set `raw.use = FALSE` in the function `computeCommunProb()` in order to use the projected data)
# cellchat <- projectData(cellchat, PPI.human)

Part II: Inference of cell-cell communication network

CellChat infers the biologically significant cell-cell communication by assigning each interaction with a probability value and peforming a permutation test. CellChat models the probability of cell-cell communication by integrating gene expression with prior known receptor-ligand interaction pairs using the law of mass action (meaning higher expression of ligand = stronger interaction).

The number of inferred ligand-receptor pairs clearly depends on the method for calculating the average gene expression per cell group. -CellChat uses ‘trimean’ which produces fewer interactions than other methods. However, CellChat performs well at predicting stronger interactions, which is very helpful for narrowing down on interactions for further experimental validations.

In computeCommunProb, Cellchat provide’s an option for using other methods, such as 5% and 10% truncated mean, to calculating the average gene expression. Of note, ‘trimean’ approximates 25% truncated mean, implying that the average gene expression is zero if the percent of expressed cells in one group is less than 25%.

You can change that value, for example by specifiying a 10% truncated mean. Do this by setting type = "truncatedMean" and trim = 0.1. The function computeAveExpr can help to check the average expression of signaling genes of interest, e.g, computeAveExpr(cellchat, features = c("CXCL12","CXCR4"), type = "truncatedMean", trim = 0.1).

When analyzing unsorted single-cell transcriptomes, under the assumption that abundant cell populations tend to send collectively stronger signals than the rare cell populations, CellChat can also consider the effect of cell proportion in each cell group in the probability calculation. USER can set population.size = TRUE.

Compute the communication probability and infer cellular communication network

If well-known signaling pathways in the studied biological process are not predicted, USER can try truncatedMean to change the method for calculating the average gene expression per cell group.

This is a very long step, especially for large datasets

So it’s helpful to save the cellchat as an .rds object to load directly for future analysis.


cellchat <- readRDS(here::here("Week 4 CellChat", "data", "CACPR_CellChat.rds"))

# cellchat <- computeCommunProb(cellchat)

# Filter out the cell-cell communication if there are only few number of cells in certain cell groups
# cellchat <- filterCommunication(cellchat, min.cells = 10)
 
# saveRDS(cellchat, file = here::here("Week 4 CellChat", "data", "CACPR_CellChat.rds"))

df.net <- subsetCommunication(cellchat)

Extract the inferred cellular communication network as a data frame

We provide a function subsetCommunication to easily access the inferred cell-cell communications of interest. For example,

  • df.net <- subsetCommunication(cellchat) returns a data frame consisting of all the inferred cell-cell communications at the level of ligands/receptors. Set slot.name = "netP" to access the the inferred communications at the level of signaling pathways

  • df.net <- subsetCommunication(cellchat, sources.use = c(1,2), targets.use = c(4,5)) gives the inferred cell-cell communications sending from cell groups 1 and 2 to cell groups 4 and 5.

  • df.net <- subsetCommunication(cellchat, signaling = c("WNT", "TGFb")) gives the inferred cell-cell communications mediated by signaling WNT and TGFb.

Infer the cell-cell communication at a signaling pathway level

CellChat computes the communication probability on signaling pathway level by summarizing the communication probabilities of all ligands-receptors interactions associated with each signaling pathway.

NB: The inferred intercellular communication network of each ligand-receptor pair and each signaling pathway is stored in the slot ‘net’ and ‘netP’, respectively.

cellchat <- computeCommunProbPathway(cellchat)

Calculate the aggregated cell-cell communication network

We can calculate the aggregated cell-cell communication network by counting the number of links or summarizing the communication probability. USER can also calculate the aggregated network among a subset of cell groups by setting sources.use and targets.use.

cellchat <- aggregateNet(cellchat)

#You can visualize the significant pathways with the following command
cellchat@netP$pathways
#>  [1] "COLLAGEN"      "SPP1"          "LAMININ"       "APP"          
#>  [5] "FGF"           "PTPRM"         "BMP"           "CDH"          
#>  [9] "EGF"           "SLIT"          "SEMA3"         "NRG"          
#> [13] "ADGRL"         "Cholesterol"   "VEGF"          "ADGRG"        
#> [17] "CADM"          "EPHA"          "Netrin"        "THBS"         
#> [21] "PDGF"          "PROS"          "ANGPTL"        "GAS"          
#> [25] "SEMA6"         "CDH1"          "AGRN"          "UNC5"         
#> [29] "TENASCIN"      "Glutamate"     "PECAM1"        "PTN"          
#> [33] "RELN"          "DHEA"          "JAM"           "NECTIN"       
#> [37] "NCAM"          "ANGPT"         "SEMA7"         "KLK"          
#> [41] "Testosterone"  "HGF"           "Prostaglandin" "NOTCH"        
#> [45] "CypA"          "EPHB"          "SEMA4"         "FLRT"         
#> [49] "ADGRE"         "CD45"          "KIT"           "NGL"          
#> [53] "BRADYKININ"    "HSPG"          "MPZ"           "CSF"          
#> [57] "OCLN"          "GDF"           "PECAM2"        "CSPG4"        
#> [61] "WNT"           "IGFBP"         "GAP"           "NRXN"         
#> [65] "CDH5"          "CCL"           "CD39"          "PLAU"         
#> [69] "CD46"          "ncWNT"         "ESAM"          "NEGR"         
#> [73] "GRN"           "NPNT"          "CLDN"          "PVR"          
#> [77] "Adenosine"     "CD96"

We can also visualize the aggregated cell-cell communication network. For example, showing the number of interactions or the total interaction strength (weights) between any two cell groups using circle plot.

groupSize <- as.numeric(table(cellchat@idents))
par(mfrow = c(1,2), xpd=TRUE)
netVisual_circle(cellchat@net$count, vertex.weight = groupSize, weight.scale = T, label.edge= F, title.name = "Number of interactions")
netVisual_circle(cellchat@net$weight, vertex.weight = groupSize, weight.scale = T, label.edge= F, title.name = "Interaction weights/strength")

Due to the complicated cell-cell communication network, we can examine the signaling sent from each cell group. Here we also control the parameter edge.weight.max so that we can compare edge weights between differet networks.

mat <- cellchat@net$weight
par(mfrow = c(3,4), xpd=TRUE)
for (i in 1:nrow(mat)) {
  mat2 <- matrix(0, nrow = nrow(mat), ncol = ncol(mat), dimnames = dimnames(mat))
  mat2[i, ] <- mat[i, ]
  netVisual_circle(mat2, vertex.weight = groupSize, weight.scale = T, edge.weight.max = max(mat), title.name = rownames(mat)[i])
}

Part III: Visualization of cell-cell communication network

Upon infering the cell-cell communication network, CellChat provides various functionality for further data exploration, analysis, and visualization.

Visualize each signaling pathway using Hierarchy plot, Circle plot or Chord diagram

Hierarchy plot: USER should define vertex.receiver, which is a numeric vector giving the index of the cell groups as targets in the left part of hierarchy plot. This hierarchical plot consist of two components: the left portion shows autocrine and paracrine signaling to certain cell groups of interest (i.e, the defined vertex.receiver), and the right portion shows autocrine and paracrine signaling to the remaining cell groups in the dataset. Thus, hierarchy plot provides an informative and intuitive way to visualize autocrine and paracrine signaling communications between cell groups of interest. For example, when studying the cell-cell communication between fibroblasts and immune cells, USER can define vertex.receiver as all fibroblast cell groups.

Chord diagram: CellChat provides two functions netVisual_chord_cell and netVisual_chord_gene for visualizing cell-cell communication with different purposes and different levels. netVisual_chord_cell is used for visualizing the cell-cell communication between different cell groups (where each sector in the chord diagram is a cell group), and netVisual_chord_gene is used for visualizing the cell-cell communication mediated by mutiple ligand-receptors or signaling pathways (where each sector in the chord diagram is a ligand, receptor or signaling pathway.)

Explnations of edge color/weight, node color/size/shape: In all visualization plots, edge colors are consistent with the sources as sender, and edge weights are proportional to the interaction strength. Thicker edge line indicates a stronger signal. In the Hierarchy plot and Circle plot, circle sizes are proportional to the number of cells in each cell group. In the hierarchy plot, solid and open circles represent source and target, respectively. In the Chord diagram, the inner thinner bar colors represent the targets that receive signal from the corresponding outer bar. The inner bar size is proportional to the signal strength received by the targets. Such inner bar is helpful for interpreting the complex chord diagram. Note that there exist some inner bars without any chord for some cell groups, please just igore it because this is an issue that has not been addressed by circlize package.

Visualization of cell-cell communication at different levels: One can visualize the inferred communication network of signaling pathways using netVisual_aggregate, and visualize the inferred communication networks of individual L-R pairs associated with that signaling pathway using netVisual_individual.

Here we take input of one signaling pathway as an example. All the signaling pathways showing significant communications can be accessed by cellchat@netP$pathways.

pathways.show <- c("PDGF") 
# Hierarchy plot
# Here we define `vertex.receive` so that the left portion of the hierarchy plot shows signaling to fibroblast and the right portion shows signaling to immune cells 
netVisual_aggregate(cellchat, signaling = pathways.show,  vertex.receiver = vertex.receiver)

# Circle plot
par(mfrow=c(1,1))
netVisual_aggregate(cellchat, signaling = pathways.show, layout = "circle")
# Chord diagram
par(mfrow=c(1,1))
netVisual_aggregate(cellchat, signaling = pathways.show, layout = "chord")

# Heatmap
par(mfrow=c(1,1))
netVisual_heatmap(cellchat, signaling = pathways.show, color.heatmap = "Reds")
#> Do heatmap based on a single object

Compute the contribution of each ligand-receptor pair to the overall signaling pathway and visualize cell-cell communication mediated by a single ligand-receptor pair

netAnalysis_contribution(cellchat, signaling = pathways.show)

We can also visualize the cell-cell communication mediated by a single ligand-receptor pair. We provide a function extractEnrichedLR to extract all the significant interactions (L-R pairs) and related signaling genes for a given signaling pathway.


pairLR.PDGF <- extractEnrichedLR(cellchat, signaling = pathways.show, geneLR.return = FALSE)
LR.show <- pairLR.PDGF[1,] # show one ligand-receptor pair
# Hierarchy plot
vertex.receiver = seq(1,4) # a numeric vector
netVisual_individual(cellchat, signaling = pathways.show,  pairLR.use = LR.show, vertex.receiver = vertex.receiver)

#> [[1]]
# Circle plot
netVisual_individual(cellchat, signaling = pathways.show, pairLR.use = LR.show, layout = "circle")
#> [[1]]
# Chord diagram
netVisual_individual(cellchat, signaling = pathways.show, pairLR.use = LR.show, layout = "chord")

#> [[1]]

Automatically save the plots of the all inferred network for quick exploration

In practical use, USERS can use ‘for … loop’ to automatically save the all inferred network for quick exploration using netVisual. netVisual supports an output in the formats of svg, png and pdf.

# Access all the signaling pathways showing significant communications
pathways.show.all <- cellchat@netP$pathways
# check the order of cell identity to set suitable vertex.receiver
levels(cellchat@idents)
vertex.receiver = seq(1,4)
for (i in 1:length(pathways.show.all)) {
  # Visualize communication network associated with both signaling pathway and individual L-R pairs
  netVisual(cellchat, signaling = pathways.show.all[i], vertex.receiver = vertex.receiver, layout = "hierarchy")
  # Compute and visualize the contribution of each ligand-receptor pair to the overall signaling pathway
  gg <- netAnalysis_contribution(cellchat, signaling = pathways.show.all[i])
  ggsave(filename=paste0(pathways.show.all[i], "_L-R_contribution.pdf"), plot=gg, width = 3, height = 2, units = 'in', dpi = 300)
}

Plot the signaling gene expression distribution using violin/dot plot

We can plot the gene expression distribution of signaling genes related to L-R pairs or signaling pathway using a Seurat wrapper function plotGeneExpression.

plotGeneExpression(cellchat, signaling = "PDGF")
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.

Part IV: CellChat App

You can also visualize the inferred cell-cell communication network using the CellChat app. The app provides a user-friendly interface for visualizing and exploring the cell-cell communication network.


runCellChatApp(cellchat)


sessionInfo()
#> R version 4.4.3 (2025-02-28 ucrt)
#> Platform: x86_64-w64-mingw32/x64
#> Running under: Windows 11 x64 (build 22631)
#> 
#> Matrix products: default
#> 
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.utf8 
#> [2] LC_CTYPE=English_United States.utf8   
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.utf8    
#> 
#> time zone: America/Los_Angeles
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] SeuratObject_5.0.2  Seurat_4.4.0        here_1.0.1         
#>  [4] presto_1.0.0        data.table_1.17.0   Rcpp_1.0.14        
#>  [7] patchwork_1.3.0     CellChat_2.1.2      Biobase_2.66.0     
#> [10] BiocGenerics_0.52.0 ggplot2_3.5.1       igraph_2.1.4       
#> [13] dplyr_1.1.4         BiocNeighbors_2.0.1
#> 
#> loaded via a namespace (and not attached):
#>   [1] RcppAnnoy_0.0.22       splines_4.4.3          later_1.4.1           
#>   [4] tibble_3.2.1           polyclip_1.10-7        ggnetwork_0.5.13      
#>   [7] lifecycle_1.0.4        rstatix_0.7.2          doParallel_1.0.17     
#>  [10] rprojroot_2.0.4        globals_0.16.3         lattice_0.22-6        
#>  [13] MASS_7.3-64            backports_1.5.0        magrittr_2.0.3        
#>  [16] plotly_4.10.4          sass_0.4.9             rmarkdown_2.29        
#>  [19] jquerylib_0.1.4        yaml_2.3.10            httpuv_1.6.15         
#>  [22] NMF_0.28               sctransform_0.4.1      spam_2.11-1           
#>  [25] sp_2.2-0               spatstat.sparse_3.1-0  reticulate_1.41.0.1   
#>  [28] cowplot_1.1.3          pbapply_1.7-2          RColorBrewer_1.1-3    
#>  [31] abind_1.4-8            Rtsne_0.17             purrr_1.0.4           
#>  [34] circlize_0.4.16        IRanges_2.40.1         S4Vectors_0.44.0      
#>  [37] ggrepel_0.9.6          irlba_2.3.5.1          listenv_0.9.1         
#>  [40] spatstat.utils_3.1-3   goftest_1.2-3          RSpectra_0.16-2       
#>  [43] spatstat.random_3.3-2  fitdistrplus_1.2-2     parallelly_1.42.0     
#>  [46] svglite_2.1.3          leiden_0.4.3.1         codetools_0.2-20      
#>  [49] tidyselect_1.2.1       shape_1.4.6.1          farver_2.1.2          
#>  [52] matrixStats_1.5.0      stats4_4.4.3           spatstat.explore_3.3-4
#>  [55] jsonlite_1.9.1         GetoptLong_1.0.5       progressr_0.15.1      
#>  [58] Formula_1.2-5          ggridges_0.5.6         ggalluvial_0.12.5     
#>  [61] survival_3.8-3         iterators_1.0.14       systemfonts_1.2.1     
#>  [64] foreach_1.5.2          tools_4.4.3            sna_2.8               
#>  [67] ica_1.0-3              glue_1.8.0             gridExtra_2.3         
#>  [70] xfun_0.51              withr_3.0.2            BiocManager_1.30.25   
#>  [73] fastmap_1.2.0          digest_0.6.37          R6_2.6.1              
#>  [76] mime_0.12              colorspace_2.1-1       scattermore_1.2       
#>  [79] Cairo_1.6-2            tensor_1.5             spatstat.data_3.1-6   
#>  [82] tidyr_1.3.1            generics_0.1.3         FNN_1.1.4.1           
#>  [85] httr_1.4.7             htmlwidgets_1.6.4      uwot_0.2.3            
#>  [88] pkgconfig_2.0.3        gtable_0.3.6           registry_0.5-1        
#>  [91] ComplexHeatmap_2.22.0  lmtest_0.9-40          htmltools_0.5.8.1     
#>  [94] carData_3.0-5          dotCall64_1.2          clue_0.3-66           
#>  [97] scales_1.3.0           png_0.1-8              spatstat.univar_3.1-2 
#> [100] knitr_1.50             rstudioapi_0.17.1      reshape2_1.4.4        
#> [103] rjson_0.2.23           coda_0.19-4.1          statnet.common_4.11.0 
#> [106] nlme_3.1-167           cachem_1.1.0           zoo_1.8-13            
#> [109] GlobalOptions_0.1.2    stringr_1.5.1          KernSmooth_2.23-26    
#> [112] vipor_0.4.7            parallel_4.4.3         miniUI_0.1.1.1        
#> [115] ggrastr_1.0.2          pillar_1.10.1          grid_4.4.3            
#> [118] vctrs_0.6.5            RANN_2.6.2             promises_1.3.2        
#> [121] ggpubr_0.6.0           car_3.1-3              xtable_1.8-4          
#> [124] cluster_2.1.8          beeswarm_0.4.0         evaluate_1.0.3        
#> [127] cli_3.6.4              compiler_4.4.3         rlang_1.1.5           
#> [130] crayon_1.5.3           rngtools_1.5.2         future.apply_1.11.3   
#> [133] ggsignif_0.6.4         labeling_0.4.3         ggbeeswarm_0.7.2      
#> [136] plyr_1.8.9             stringi_1.8.4          viridisLite_0.4.2     
#> [139] network_1.19.0         deldir_2.0-4           gridBase_0.4-7        
#> [142] munsell_0.5.1          lazyeval_0.2.2         spatstat.geom_3.3-5   
#> [145] Matrix_1.7-2           future_1.34.0          shiny_1.10.0          
#> [148] ROCR_1.0-11            broom_1.0.7            bslib_0.9.0