Background¶

LDpred-2 is one of the dedicated PRS programs which is an R package that uses a Bayesian approach to polygenic risk scoring.

Installing LDpred-2¶

Note

The script used here is based on LDpred 2 implemented under bigsnpr version 1.4.7

Note

For more details, please refer to LDpred 2's homepage

You can install LDpred and its dependencies in R with the following command:

install.packages("remotes")
library(remotes)
remotes::install_github("https://github.com/privefl/bigsnpr.git")

Note

For mac users, you might need to follow the guide here to be able to install LDpred2

Required Data¶

We assume that you have the following files (or you can download it from here):

File Name	Description
Height.QC.gz	The post-QCed summary statistic
EUR.QC.bed	The genotype file after performing some basic filtering
EUR.QC.bim	This file contains the SNPs that passed the basic filtering
EUR.QC.fam	This file contains the samples that passed the basic filtering
EUR.height	This file contains the phenotype of the samples
EUR.cov	This file contains the covariates of the samples
EUR.eigenvec	This file contains the PCs of the samples

Warning

While we do provide a rough guide on how to perform LDpred on bed files separated into individual chromosomes, this script is untested and extra caution is required

0. Prepare workspace¶

On some server, you might need to first use the following code in order to run LDpred with multi-thread

prepare workspace and load bigsnpr

library(bigsnpr)
options(bigstatsr.check.parallel.blas = FALSE)
options(default.nproc.blas = NULL)

1. Read in the phenotype and covariate files¶

read in phenotype and covariates

library(data.table)
library(magrittr)
phenotype <- fread("EUR.height")
covariate <- fread("EUR.cov")
pcs <- fread("EUR.eigenvec")
# rename columns
colnames(pcs) <- c("FID","IID", paste0("PC",1:6))
# generate required table
pheno <- merge(phenotype, covariate) %>%
    merge(., pcs)

2. Obtain HapMap3 SNPs¶

LDpred2 authors recommend restricting the analysis to only the HapMap3 SNPs

load HapMap3 SNPs

info <- readRDS(runonce::download_file(
  "https://ndownloader.figshare.com/files/25503788",
  fname = "map_hm3_ldpred2.rds"))

3. Load and transform the summary statistic file¶

Load summary statistic file

# Read in the summary statistic file
sumstats <- bigreadr::fread2("Height.QC.gz") 
# LDpred 2 require the header to follow the exact naming
names(sumstats) <-
    c("chr",
    "pos",
    "rsid",
    "a1",
    "a0",
    "n_eff",
    "beta_se",
    "p",
    "OR",
    "INFO",
    "MAF")
# Transform the OR into log(OR)
sumstats$beta <- log(sumstats$OR)
# Filter out hapmap SNPs
sumstats <- sumstats[sumstats$rsid%in% info$rsid,]

Warning

Here, we know the exact ordering of the summary statistics file. However, in many cases, the ordering of the summary statistics differ, thus one must rename the columns according to their actual ordering

3. Calculate the LD matrix¶

Genome Wide bed file

# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file 
fam.order <- NULL
# preprocess the bed file (only need to do once for each data set)
snp_readBed("EUR.QC.bed")
# now attach the genotype object
obj.bigSNP <- snp_attach("EUR.QC.rds")
# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
info_snp <- snp_match(sumstats, map)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
for (chr in 1:22) {
    # Extract SNPs that are included in the chromosome
    ind.chr <- which(info_snp$chr == chr)
    ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
    # Calculate the LD
    corr0 <- snp_cor(
            genotype,
            ind.col = ind.chr2,
            ncores = NCORES,
            infos.pos = POS2[ind.chr2],
            size = 3 / 1000
        )
    if (chr == 1) {
        ld <- Matrix::colSums(corr0^2)
        corr <- as_SFBM(corr0, tmp)
    } else {
        ld <- c(ld, Matrix::colSums(corr0^2))
        corr$add_columns(corr0, nrow(corr))
    }
}
# We assume the fam order is the same across different chromosomes
fam.order <- as.data.table(obj.bigSNP$fam)
# Rename fam order
setnames(fam.order,
        c("family.ID", "sample.ID"),
        c("FID", "IID"))

Chromosome separated bed files

# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file 
info_snp <- NULL
fam.order <- NULL
for (chr in 1:22) {
    # preprocess the bed file (only need to do once for each data set)
    # Assuming the file naming is EUR_chr#.bed
    snp_readBed(paste0("EUR_chr",chr,".bed"))
    # now attach the genotype object
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,".rds"))
    # extract the SNP information from the genotype
    map <- obj.bigSNP$map[-3]
    names(map) <- c("chr", "rsid", "pos", "a1", "a0")
    # perform SNP matching
    tmp_snp <- snp_match(sumstats[sumstats$chr==chr,], map)
    info_snp <- rbind(info_snp, tmp_snp)
    # Assign the genotype to a variable for easier downstream analysis
    genotype <- obj.bigSNP$genotypes
    # Rename the data structures
    CHR <- map$chr
    POS <- map$pos
    # get the CM information from 1000 Genome
    # will download the 1000G file to the current directory (".")
    POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
    # calculate LD
    # Extract SNPs that are included in the chromosome
    ind.chr <- which(tmp_snp$chr == chr)
    ind.chr2 <- tmp_snp$`_NUM_ID_`[ind.chr]
    # Calculate the LD
    corr0 <- snp_cor(
            genotype,
            ind.col = ind.chr2,
            ncores = NCORES,
            infos.pos = POS2[ind.chr2],
            size = 3 / 1000
        )
    if (chr == 1) {
        ld <- Matrix::colSums(corr0^2)
        corr <- as_SFBM(corr0, tmp)
    } else {
        ld <- c(ld, Matrix::colSums(corr0^2))
        corr$add_columns(corr0, nrow(corr))
    }
    # We assume the fam order is the same across different chromosomes
    if(is.null(fam.order)){
        fam.order <- as.data.table(obj.bigSNP$fam)
    }
}
# Rename fam order
setnames(fam.order,
        c("family.ID", "sample.ID"),
        c("FID", "IID"))

4. Perform LD score regression¶

Perform LD score regression

df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
ldsc <- snp_ldsc(   ld, 
                    length(ld), 
                    chi2 = (df_beta$beta / df_beta$beta_se)^2,
                    sample_size = df_beta$n_eff, 
                    blocks = NULL)
h2_est <- ldsc[["h2"]]

5. Calculate the null R2¶

Calculate the null R2 (quantitative trait)

# Reformat the phenotype file such that y is of the same order as the 
# sample ordering in the genotype file
y <- pheno[fam.order, on = c("FID", "IID")]
# Calculate the null R2
# use glm for binary trait 
# (will also need the fmsb package to calculate the pseudo R2)
null.model <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~Sex+", .) %>%
    as.formula %>%
    lm(., data = y) %>%
    summary
null.r2 <- null.model$r.squared

Calculate the null R2 (binary trait)

library(fmsb)
# Reformat the phenotype file such that y is of the same order as the 
# sample ordering in the genotype file
y <- pheno[fam.order, on = c("FID", "IID")]
# Calculate the null R2
# use glm for binary trait 
# (will also need the fmsb package to calculate the pseudo R2)
null.model <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~Sex+", .) %>%
    as.formula %>%
    glm(., data = y, family=binomial) %>%
    summary
null.r2 <- fmsb::NagelkerkeR2(null.model)

Important

Scripts for binary trait analysis only serve as a reference as we have not simulate any binary traits. In addition, Nagelkerke \(R^2\) is biased when there are ascertainment of samples. For more information, please refer to this paper

infinitesimal model

beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)

grid model

# Prepare data for grid model
p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
grid.param <-
    expand.grid(p = p_seq,
            h2 = h2_seq,
            sparse = c(FALSE, TRUE))
# Get adjusted beta from grid model
beta_grid <-
    snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)

auto model

# Get adjusted beta from the auto model
multi_auto <- snp_ldpred2_auto(
    corr,
    df_beta,
    h2_init = h2_est,
    vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES),
    ncores = NCORES
)
beta_auto <- sapply(multi_auto, function(auto)
    auto$beta_est)

7. Obtain model PRS¶

Using Genome wide bed file¶

infinitesimal model

if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_inf <- big_prodVec(    genotype,
                            beta_inf,
                            ind.row = ind.test,
                            ind.col = info_snp$`_NUM_ID_`)

grid model

if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_grid <- big_prodMat(   genotype, 
                            beta_grid, 
                            ind.col = info_snp$`_NUM_ID_`)

auto model

if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_auto <-
    big_prodMat(genotype,
                beta_auto,
                ind.row = ind.test,
                ind.col = info_snp$`_NUM_ID_`)
# scale the PRS generated from AUTO
pred_scaled <- apply(pred_auto, 2, sd)
final_beta_auto <-
    rowMeans(beta_auto[,
                abs(pred_scaled -
                    median(pred_scaled)) <
                    3 * mad(pred_scaled)])
pred_auto <-
    big_prodVec(genotype,
        final_beta_auto,
        ind.row = ind.test,
        ind.col = info_snp$`_NUM_ID_`)

Using chromosome separated bed files¶

infinitesimal model

pred_inf <- NULL
for(chr in 1:22){
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,".rds"))
    genotype <- obj.bigSNP$genotypes
    # calculate PRS for all samples
    ind.test <- 1:nrow(genotype)
    # Extract SNPs in this chromosome
    chr.idx <- which(info_snp$chr == chr)
    ind.chr <- info_snp$`_NUM_ID_`[chr.idx]
    tmp <- big_prodVec(genotype,
                            beta_inf,
                            ind.row = ind.test,
                            ind.col = ind.chr)
    if(is.null(pred_inf)){
        pred_inf <- tmp
    }else{
        pred_inf <- pred_inf + tmp
    }
}

grid model

pred_grid <- NULL
for(chr in 1:22){
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,"_.rds"))
    genotype <- obj.bigSNP$genotypes
    # calculate PRS for all samples
    ind.test <- 1:nrow(genotype)
    # Extract SNPs in this chromosome
    chr.idx <- which(info_snp$chr == chr)
    ind.chr <- info_snp$`_NUM_ID_`[chr.idx]

    tmp <- big_prodMat( genotype, 
                        beta_grid, 
                        ind.col = ind.chr)

    if(is.null(pred_grid)){
        pred_grid <- tmp
    }else{
        pred_grid <- pred_grid + tmp
    }
}

auto model

pred_auto <- NULL
for(chr in 1:22){
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,"_.rds"))
    genotype <- obj.bigSNP$genotypes
    # calculate PRS for all samples
    ind.test <- 1:nrow(genotype)
    # Extract SNPs in this chromosome
    chr.idx <- which(info_snp$chr == chr)
    ind.chr <- info_snp$`_NUM_ID_`[chr.idx]

    tmp <-
        big_prodMat(genotype,
                    beta_auto,
                    ind.row = ind.test,
                    ind.col = ind.chr)
    # scale the PRS generated from AUTO
    pred_scaled <- apply(tmp, 2, sd)
    final_beta_auto <-
        rowMeans(tmp[chr.idx,
                    abs(pred_scaled -
                        median(pred_scaled)) <
                        3 * mad(pred_scaled)])
    tmp <-
        big_prodVec(genotype,
            final_beta_auto,
            ind.row = ind.test,
            ind.col = ind.chr)
    if(is.null(pred_auto)){
        pred_auto <- tmp
    }else{
        pred_auto <- pred_auto + tmp
    }
}

8. Get the final performance of the LDpred models¶

infinitesimal model

reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~PRS+Sex+", .) %>%
    as.formula
reg.dat <- y
reg.dat$PRS <- pred_inf
inf.model <- lm(reg.formula, dat=reg.dat) %>%
    summary
(result <- data.table(
    infinitesimal = inf.model$r.squared - null.r2,
    null = null.r2
))

grid model

reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~PRS+Sex+", .) %>%
    as.formula
reg.dat <- y
max.r2 <- 0
for(i in 1:ncol(pred_grid)){
    reg.dat$PRS <- pred_grid[,i]
    grid.model <- lm(reg.formula, dat=reg.dat) %>%
        summary  
    if(max.r2 < grid.model$r.squared){
        max.r2 <- grid.model$r.squared
    }
}
(result <- data.table(
    grid = max.r2 - null.r2,
    null = null.r2
))

auto model

reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~PRS+Sex+", .) %>%
    as.formula
reg.dat <- y
reg.dat$PRS <- pred_auto
auto.model <- lm(reg.formula, dat=reg.dat) %>%
    summary
(result <- data.table(
    auto = auto.model$r.squared - null.r2,
    null = null.r2
))

How much phenotypic variation does the PRS from each model explain?

Infinitesimal = 0.0248696

Grid Model = 0.001746926

Auto Model = 0.1751478