Introduction

The procedures in this step aim to determine the enrichment of differentially expressed cell type gene markers in each cell cluster. Thus, the overall assumption is that the cells relationship in each cluster group also determine the cell type that characterize each cluster.

Pre-requirements

To start the procedures we consider:

  • set of Seurat cell clusters with a high-resolution parameter (resolution 2 - 3).
  • list of cell type markers.
  • DEG (one vs all) for each Seurat cluster.

For this tutorial the following clusters were considered

DimPlot(aux.seurat, reduction="umap")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

Differential expression of genes per cluster

The differential expression analysis is performed using MAST approach. It may take few hours depending on the number of cells and clusters.

markers <- FindAllMarkers(aux.seurat, test.use = "MAST")
write.table(markers, file = "DEG_seurat_clusters_res3.csv", quote = F, sep = ",")

Let’s take a look at the general cell type markers considered for the assigments. Other genes marker can be included to the list.

Table of cell type marker genes

table.gene.markers <- data.frame(Gene=c("EPCAM", "KRT8", "KRT18", "KRT19", "KRT10", "KRT7", "FOXJ1"), Cell="Epithelial_cells")
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("ACTA2"), Cell="Smooth_Muscle_cells"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("DCN", "COL11A2", "FAP", "PDGFRA", "COL11A1", "COL1A1", "PDGFRB"), Cell="Mesenchymal_cells"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("LYZ", "CD14", "MME", "C1QA", "CLEC10A"), Cell="Myeloid_cells"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("CLDN5", "PECAM1", "CD34", "ESAM"), Cell="Endothelial_cells"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("CD79A", "JCHAIN"), Cell="B/Plasma_cells"))
#table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("JCHAIN"), Cell="B Cells"))
#table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("TYROBP", "FCGR3A"), Cell="Natural Killer"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("TPSB2"), Cell="Mast_cells"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("HBB", "GYPA"), Cell="Erythrocytes"))
table.gene.markers <- rbind(table.gene.markers, data.frame(Gene=c("CD2", "CD3D", "CD3E", "CD3G", 'CD8A', "CCL5", "PTPRC", "TYROBP", "FCGR3A"), Cell="T/NK Cells"))

grid::grid.newpage()
gridExtra::grid.table(table.gene.markers, rows=NULL)

Bellow you can find auxiliary functions to count the number of cell marker genes that are differentialy expressed in each cluster considering fold change >= 0.2 and p-value <= 0.05

markers = read.delim("files/DEG_seurat_clusters_res3.csv", sep = ",")

get_markers <- function(cell) {
  return(table.gene.markers$Gene[which(table.gene.markers$Cell == cell)])
}

count_marker_DEG <- function(table.gene.markers, deg, fc=0.2, pval=0.05) {

  clusters = unique(deg$cluster)
  counts = list()

  for (x in (unique(table.gene.markers$Cell))) {
    table = NULL
    for (i in clusters) {
      filt = deg[which(deg$cluster == i),]
      if (nrow(filt) > 0) {
        res = sum(filt$avg_log2FC >= fc & filt$p_val_adj <= pval & filt$gene %in% get_markers(cell = x))
      }else
        res = 0

      table = rbind(table, data.frame(Cluster=i, MarkesDEG=res))
    }
    counts[[x]] <- table
  }
  return(counts)
}

Lets now use the functions above to count the DEG marks using our previous gene table list.

counts_deg = count_marker_DEG(table.gene.markers, markers, fc = 0.1)
counts_deg = do.call(cbind, counts_deg)
rownames(counts_deg) = counts_deg$Epithelial_cells.Cluster

In this step we normalize the counts by the number of the total markers for each corresponding cell type.

row.bar = data.frame(table(Idents(aux.seurat)))
row_ha = rowAnnotation(ClusterSize = anno_barplot(row.bar$Freq))

markers.count = data.frame(table(table.gene.markers$Cell))
markers.count = markers.count[match(colnames(counts_deg[,seq(2, ncol(counts_deg), by = 2)]), paste0(markers.count$Var1, ".MarkesDEG")),]
markers.count$ColumnName = paste0(markers.count$Var1, " (n=", markers.count$Freq, ")")
head(markers.count)
##                  Var1 Freq                ColumnName
## 3    Epithelial_cells    7    Epithelial_cells (n=7)
## 8 Smooth_Muscle_cells    1 Smooth_Muscle_cells (n=1)
## 6   Mesenchymal_cells    7   Mesenchymal_cells (n=7)
## 7       Myeloid_cells    5       Myeloid_cells (n=5)
## 2   Endothelial_cells    4   Endothelial_cells (n=4)
## 1      B/Plasma_cells    2      B/Plasma_cells (n=2)
y = counts_deg[,seq(2, ncol(counts_deg), by = 2)]
x = markers.count$Freq
z = (t(t(y) / x))

rownames(z) = rownames(counts_deg)
colnames(z) = markers.count$ColumnName
head(z)
##   Epithelial_cells (n=7) Smooth_Muscle_cells (n=1) Mesenchymal_cells (n=7)
## 0                      0                         0               0.1428571
## 1                      0                         0               0.0000000
## 2                      0                         0               0.2857143
## 3                      0                         0               0.0000000
## 4                      0                         0               0.0000000
## 5                      0                         0               0.0000000
##   Myeloid_cells (n=5) Endothelial_cells (n=4) B/Plasma_cells (n=2)
## 0                   0                       0                    0
## 1                   0                       0                    0
## 2                   0                       0                    0
## 3                   0                       0                    0
## 4                   0                       0                    0
## 5                   0                       1                    0
##   Mast_cells (n=1) Erythrocytes (n=2) T/NK Cells (n=9)
## 0                0                  0        0.0000000
## 1                0                  0        0.0000000
## 2                0                  0        0.0000000
## 3                0                  0        0.7777778
## 4                0                  0        0.6666667
## 5                0                  0        0.0000000

Next, let’s visualize the enrichment of marks (DEG counts proportion) as a heatmap plot.

ht1 = Heatmap(z,
              na_col = "white",
              cluster_rows = T,
              cluster_columns = F,
              show_row_names = T,
              row_names_gp = gpar(fontsize = 12),
              row_names_side = "left",
              rect_gp = gpar(col = "black", lwd = 0.6),
              show_column_names = T,
              right_annotation = row_ha,
              col = colorRamp2(c(0, mean(as.matrix(z)), max(z)), c("#FFFFFF" , "#d8b365", "#01665e"))

)

draw(ht1, heatmap_legend_side = "right")

Next, the expression of the markers can also be visualized and evaluated using Seurat DotPlot function. We first change the order of the rows to match the order of the non-supervised clustering obtained, obtained previously from the marker count proportion.

aux.seurat@active.ident <- factor(aux.seurat@active.ident,
                                  levels=rownames(z)[rev(row_order(ht1))]
)

DotPlot(object = aux.seurat, features = rev(unique(table.gene.markers$Gene))) +
  scale_colour_gradient2(low = "#2166ac", mid = "#f7f7f7", high = "#b2182b") +
  RotatedAxis() +
  theme(axis.text.x = element_text(size = 12))

Cluster assignments procedures

In this section we defined some rules to systematically assign the cell type for each cluster. The code corresponds to the workflow image in the main tab at the navigation menu.

expr.markers = DotPlot(object = aux.seurat, features = rev(unique(table.gene.markers$Gene)))

expr.markers.data = expr.markers$data
krt.genes = table.gene.markers$Gene[grep("KRT", table.gene.markers$Gene)]

my_cluster_col <- data.frame(row.names = rownames(z))
my_cluster_col$CellType = NA

for (r in 1:nrow(z)) {

  c = r -1
  nmarkers = which(z[r,] > 0)
  pct.KRT = expr.markers.data[expr.markers.data$id == c & expr.markers.data$features.plot %in% krt.genes,]

  if ( (sum(pct.KRT$pct.exp > 35) > 0 &  sum(pct.KRT$avg.exp.scaled > 1) > 0) ) {
    my_cluster_col$CellType[r] = "Epithelial_cells"
  } else if ( length(nmarkers) > 1  ){

    nmarkers = z[r,which(z[r,] > 0)]

    if ( length(grep("Smooth_Muscle", names(nmarkers))) > 0 ){

      pos.fibro = grep("Mesenchymal", names(nmarkers))
      if ( length(pos.fibro) > 0 ) {
        if (nmarkers[pos.fibro] > 0.25) {
          my_cluster_col$CellType[r] = "Mesenchymal_cells"
        }else {
          my_cluster_col$CellType[r] = "Smooth_Muscle_cells"
        }
      } else {
        marker = which.max(nmarkers)
        my_cluster_col$CellType[r] = gsub(" .*", "", names(marker))
      }

    } else {
      nmarkers = z[r,which(z[r,] > 0)] * markers.count$Freq[which(markers.count$ColumnName %in% names(nmarkers))]

      if (length(unique(nmarkers)) == 1) {
        nmarkers = z[r,which(z[r,] > 0)]
        marker = which.max(nmarkers)
      } else {
        marker = which.max(nmarkers)
      }
      my_cluster_col$CellType[r] = gsub(" .*", "", names(marker))
    }

  } else if (length(nmarkers) == 1) {
    my_cluster_col$CellType[r] = gsub(" .*", "", names(nmarkers))
  }

}

Let’s now check the table of cell assignments and write the output as a .cvs file and go to the Step 2

my_cluster_col
##                CellType
## 0     Mesenchymal_cells
## 1                  <NA>
## 2     Mesenchymal_cells
## 3                  T/NK
## 4                  T/NK
## 5     Endothelial_cells
## 6     Endothelial_cells
## 7                  T/NK
## 8     Mesenchymal_cells
## 9                  <NA>
## 10    Mesenchymal_cells
## 11                 T/NK
## 12    Mesenchymal_cells
## 13    Mesenchymal_cells
## 14                 T/NK
## 15                 <NA>
## 16  Smooth_Muscle_cells
## 17    Mesenchymal_cells
## 18                 T/NK
## 19    Mesenchymal_cells
## 20        Myeloid_cells
## 21    Mesenchymal_cells
## 22                 T/NK
## 23    Mesenchymal_cells
## 24        Myeloid_cells
## 25                 T/NK
## 26    Mesenchymal_cells
## 27     Epithelial_cells
## 28                 T/NK
## 29       B/Plasma_cells
## 30    Mesenchymal_cells
## 31    Mesenchymal_cells
## 32                 T/NK
## 33                 T/NK
## 34    Mesenchymal_cells
## 35    Mesenchymal_cells
## 36  Smooth_Muscle_cells
## 37                 T/NK
## 38         Erythrocytes
## 39        Myeloid_cells
## 40    Mesenchymal_cells
## 41       B/Plasma_cells
## 42        Myeloid_cells
## 43     Epithelial_cells
## 44    Mesenchymal_cells
## 45    Mesenchymal_cells
## 46                 <NA>
## 47    Mesenchymal_cells
## 48    Mesenchymal_cells
## 49    Endothelial_cells
## 50        Myeloid_cells
## 51  Smooth_Muscle_cells
## 52     Epithelial_cells
## 53     Epithelial_cells
## 54     Epithelial_cells
## 55     Epithelial_cells
## 56                 T/NK
## 57     Epithelial_cells
## 58    Mesenchymal_cells
## 59    Mesenchymal_cells
## 60        Myeloid_cells
## 61                 T/NK
## 62     Epithelial_cells
## 63     Epithelial_cells
## 64    Mesenchymal_cells
## 65     Epithelial_cells
## 66  Smooth_Muscle_cells
## 67     Epithelial_cells
## 68           Mast_cells
## 69         Erythrocytes
## 70        Myeloid_cells
## 71     Epithelial_cells
## 72                 T/NK
## 73                 T/NK
## 74    Mesenchymal_cells
## 75    Mesenchymal_cells
## 76    Mesenchymal_cells
## 77                 T/NK
## 78                 T/NK
## 79     Epithelial_cells
## 80                 T/NK
## 81                 T/NK
## 82         Erythrocytes
## 83    Mesenchymal_cells
## 84        Myeloid_cells
## 85       B/Plasma_cells
## 86  Smooth_Muscle_cells
## 87    Mesenchymal_cells
## 88     Epithelial_cells
## 89                 T/NK
## 90    Endothelial_cells
## 91        Myeloid_cells
## 92                 T/NK
## 93                 T/NK
## 94                 T/NK
## 95    Endothelial_cells
## 96                 T/NK
## 97         Erythrocytes
## 98     Epithelial_cells
## 99    Endothelial_cells
## 100    Epithelial_cells
## 101                <NA>
## 102       Myeloid_cells
## 103   Mesenchymal_cells
## 104                T/NK
## 105                T/NK
## 106    Epithelial_cells
## 107                T/NK
## 108       Myeloid_cells
## 109   Mesenchymal_cells
## 110   Endothelial_cells
## 111    Epithelial_cells
## 112 Smooth_Muscle_cells
## 113                <NA>
write.table(my_cluster_col, 
            "files/reference_panel_clusters_assigned.cvs", 
            sep = ",",
            quote = FALSE)