Background

LDpred-2 is one of the dedicated PRS programs which is an R package that uses a Bayesian approach to polygenic risk scoring.

Installing LDpred-2

Note

The script used here is based on LDpred 2 implemented under bigsnpr version 1.4.7

Note

For more details, please refer to LDpred 2's homepage

You can install LDpred and its dependencies in R with the following command:

install.packages("remotes")
library(remotes)
remotes::install_github("https://github.com/privefl/bigsnpr.git")

Note

For mac users, you might need to follow the guide here to be able to install LDpred2

Required Data

We assume that you have the following files (or you can download it from here):

File Name Description
Height.QC.gz The post-QCed summary statistic
EUR.QC.bed The genotype file after performing some basic filtering
EUR.QC.bim This file contains the SNPs that passed the basic filtering
EUR.QC.fam This file contains the samples that passed the basic filtering
EUR.height This file contains the phenotype of the samples
EUR.cov This file contains the covariates of the samples
EUR.eigenvec This file contains the PCs of the samples

Warning

While we do provide a rough guide on how to perform LDpred on bed files separated into individual chromosomes, this script is untested and extra caution is required

0. Prepare workspace

On some server, you might need to first use the following code in order to run LDpred with multi-thread

library(bigsnpr)
options(bigstatsr.check.parallel.blas = FALSE)
options(default.nproc.blas = NULL)

1. Read in the phenotype and covariate files

library(data.table)
library(magrittr)
phenotype <- fread("EUR.height")
covariate <- fread("EUR.cov")
pcs <- fread("EUR.eigenvec")
# rename columns
colnames(pcs) <- c("FID","IID", paste0("PC",1:6))
# generate required table
pheno <- merge(phenotype, covariate) %>%
    merge(., pcs)

2. Obtain HapMap3 SNPs

LDpred2 authors recommend restricting the analysis to only the HapMap3 SNPs

info <- readRDS(runonce::download_file(
  "https://ndownloader.figshare.com/files/25503788",
  fname = "map_hm3_ldpred2.rds"))

3. Load and transform the summary statistic file

# Read in the summary statistic file
sumstats <- bigreadr::fread2("Height.QC.gz") 
# LDpred 2 require the header to follow the exact naming
names(sumstats) <-
    c("chr",
    "pos",
    "rsid",
    "a1",
    "a0",
    "n_eff",
    "beta_se",
    "p",
    "OR",
    "INFO",
    "MAF")
# Transform the OR into log(OR)
sumstats$beta <- log(sumstats$OR)
# Filter out hapmap SNPs
sumstats <- sumstats[sumstats$rsid%in% info$rsid,]

Warning

Here, we know the exact ordering of the summary statistics file. However, in many cases, the ordering of the summary statistics differ, thus one must rename the columns according to their actual ordering

3. Calculate the LD matrix

# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file 
fam.order <- NULL
# preprocess the bed file (only need to do once for each data set)
snp_readBed("EUR.QC.bed")
# now attach the genotype object
obj.bigSNP <- snp_attach("EUR.QC.rds")
# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
info_snp <- snp_match(sumstats, map)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
for (chr in 1:22) {
    # Extract SNPs that are included in the chromosome
    ind.chr <- which(info_snp$chr == chr)
    ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
    # Calculate the LD
    corr0 <- snp_cor(
            genotype,
            ind.col = ind.chr2,
            ncores = NCORES,
            infos.pos = POS2[ind.chr2],
            size = 3 / 1000
        )
    if (chr == 1) {
        ld <- Matrix::colSums(corr0^2)
        corr <- as_SFBM(corr0, tmp)
    } else {
        ld <- c(ld, Matrix::colSums(corr0^2))
        corr$add_columns(corr0, nrow(corr))
    }
}
# We assume the fam order is the same across different chromosomes
fam.order <- as.data.table(obj.bigSNP$fam)
# Rename fam order
setnames(fam.order,
        c("family.ID", "sample.ID"),
        c("FID", "IID"))
# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file 
info_snp <- NULL
fam.order <- NULL
for (chr in 1:22) {
    # preprocess the bed file (only need to do once for each data set)
    # Assuming the file naming is EUR_chr#.bed
    snp_readBed(paste0("EUR_chr",chr,".bed"))
    # now attach the genotype object
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,".rds"))
    # extract the SNP information from the genotype
    map <- obj.bigSNP$map[-3]
    names(map) <- c("chr", "rsid", "pos", "a1", "a0")
    # perform SNP matching
    tmp_snp <- snp_match(sumstats[sumstats$chr==chr,], map)
    info_snp <- rbind(info_snp, tmp_snp)
    # Assign the genotype to a variable for easier downstream analysis
    genotype <- obj.bigSNP$genotypes
    # Rename the data structures
    CHR <- map$chr
    POS <- map$pos
    # get the CM information from 1000 Genome
    # will download the 1000G file to the current directory (".")
    POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
    # calculate LD
    # Extract SNPs that are included in the chromosome
    ind.chr <- which(tmp_snp$chr == chr)
    ind.chr2 <- tmp_snp$`_NUM_ID_`[ind.chr]
    # Calculate the LD
    corr0 <- snp_cor(
            genotype,
            ind.col = ind.chr2,
            ncores = NCORES,
            infos.pos = POS2[ind.chr2],
            size = 3 / 1000
        )
    if (chr == 1) {
        ld <- Matrix::colSums(corr0^2)
        corr <- as_SFBM(corr0, tmp)
    } else {
        ld <- c(ld, Matrix::colSums(corr0^2))
        corr$add_columns(corr0, nrow(corr))
    }
    # We assume the fam order is the same across different chromosomes
    if(is.null(fam.order)){
        fam.order <- as.data.table(obj.bigSNP$fam)
    }
}
# Rename fam order
setnames(fam.order,
        c("family.ID", "sample.ID"),
        c("FID", "IID"))

4. Perform LD score regression

df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
ldsc <- snp_ldsc(   ld, 
                    length(ld), 
                    chi2 = (df_beta$beta / df_beta$beta_se)^2,
                    sample_size = df_beta$n_eff, 
                    blocks = NULL)
h2_est <- ldsc[["h2"]]

5. Calculate the null R2

# Reformat the phenotype file such that y is of the same order as the 
# sample ordering in the genotype file
y <- pheno[fam.order, on = c("FID", "IID")]
# Calculate the null R2
# use glm for binary trait 
# (will also need the fmsb package to calculate the pseudo R2)
null.model <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~Sex+", .) %>%
    as.formula %>%
    lm(., data = y) %>%
    summary
null.r2 <- null.model$r.squared
library(fmsb)
# Reformat the phenotype file such that y is of the same order as the 
# sample ordering in the genotype file
y <- pheno[fam.order, on = c("FID", "IID")]
# Calculate the null R2
# use glm for binary trait 
# (will also need the fmsb package to calculate the pseudo R2)
null.model <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~Sex+", .) %>%
    as.formula %>%
    glm(., data = y, family=binomial) %>%
    summary
null.r2 <- fmsb::NagelkerkeR2(null.model)

Important

Scripts for binary trait analysis only serve as a reference as we have not simulate any binary traits. In addition, Nagelkerke \(R^2\) is biased when there are ascertainment of samples. For more information, please refer to this paper

beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)
# Prepare data for grid model
p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
grid.param <-
    expand.grid(p = p_seq,
            h2 = h2_seq,
            sparse = c(FALSE, TRUE))
# Get adjusted beta from grid model
beta_grid <-
    snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)
# Get adjusted beta from the auto model
multi_auto <- snp_ldpred2_auto(
    corr,
    df_beta,
    h2_init = h2_est,
    vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES),
    ncores = NCORES
)
beta_auto <- sapply(multi_auto, function(auto)
    auto$beta_est)

7. Obtain model PRS

Using Genome wide bed file

if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_inf <- big_prodVec(    genotype,
                            beta_inf,
                            ind.row = ind.test,
                            ind.col = info_snp$`_NUM_ID_`)
if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_grid <- big_prodMat(   genotype, 
                            beta_grid, 
                            ind.col = info_snp$`_NUM_ID_`)
if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_auto <-
    big_prodMat(genotype,
                beta_auto,
                ind.row = ind.test,
                ind.col = info_snp$`_NUM_ID_`)
# scale the PRS generated from AUTO
pred_scaled <- apply(pred_auto, 2, sd)
final_beta_auto <-
    rowMeans(beta_auto[,
                abs(pred_scaled -
                    median(pred_scaled)) <
                    3 * mad(pred_scaled)])
pred_auto <-
    big_prodVec(genotype,
        final_beta_auto,
        ind.row = ind.test,
        ind.col = info_snp$`_NUM_ID_`)

Using chromosome separated bed files

pred_inf <- NULL
for(chr in 1:22){
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,".rds"))
    genotype <- obj.bigSNP$genotypes
    # calculate PRS for all samples
    ind.test <- 1:nrow(genotype)
    # Extract SNPs in this chromosome
    chr.idx <- which(info_snp$chr == chr)
    ind.chr <- info_snp$`_NUM_ID_`[chr.idx]
    tmp <- big_prodVec(genotype,
                            beta_inf,
                            ind.row = ind.test,
                            ind.col = ind.chr)
    if(is.null(pred_inf)){
        pred_inf <- tmp
    }else{
        pred_inf <- pred_inf + tmp
    }
}
pred_grid <- NULL
for(chr in 1:22){
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,"_.rds"))
    genotype <- obj.bigSNP$genotypes
    # calculate PRS for all samples
    ind.test <- 1:nrow(genotype)
    # Extract SNPs in this chromosome
    chr.idx <- which(info_snp$chr == chr)
    ind.chr <- info_snp$`_NUM_ID_`[chr.idx]

    tmp <- big_prodMat( genotype, 
                        beta_grid, 
                        ind.col = ind.chr)

    if(is.null(pred_grid)){
        pred_grid <- tmp
    }else{
        pred_grid <- pred_grid + tmp
    }
}
pred_auto <- NULL
for(chr in 1:22){
    obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,"_.rds"))
    genotype <- obj.bigSNP$genotypes
    # calculate PRS for all samples
    ind.test <- 1:nrow(genotype)
    # Extract SNPs in this chromosome
    chr.idx <- which(info_snp$chr == chr)
    ind.chr <- info_snp$`_NUM_ID_`[chr.idx]

    tmp <-
        big_prodMat(genotype,
                    beta_auto,
                    ind.row = ind.test,
                    ind.col = ind.chr)
    # scale the PRS generated from AUTO
    pred_scaled <- apply(tmp, 2, sd)
    final_beta_auto <-
        rowMeans(tmp[chr.idx,
                    abs(pred_scaled -
                        median(pred_scaled)) <
                        3 * mad(pred_scaled)])
    tmp <-
        big_prodVec(genotype,
            final_beta_auto,
            ind.row = ind.test,
            ind.col = ind.chr)
    if(is.null(pred_auto)){
        pred_auto <- tmp
    }else{
        pred_auto <- pred_auto + tmp
    }
}

8. Get the final performance of the LDpred models

reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~PRS+Sex+", .) %>%
    as.formula
reg.dat <- y
reg.dat$PRS <- pred_inf
inf.model <- lm(reg.formula, dat=reg.dat) %>%
    summary
(result <- data.table(
    infinitesimal = inf.model$r.squared - null.r2,
    null = null.r2
))
reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~PRS+Sex+", .) %>%
    as.formula
reg.dat <- y
max.r2 <- 0
for(i in 1:ncol(pred_grid)){
    reg.dat$PRS <- pred_grid[,i]
    grid.model <- lm(reg.formula, dat=reg.dat) %>%
        summary  
    if(max.r2 < grid.model$r.squared){
        max.r2 <- grid.model$r.squared
    }
}
(result <- data.table(
    grid = max.r2 - null.r2,
    null = null.r2
))
reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
    paste0("Height~PRS+Sex+", .) %>%
    as.formula
reg.dat <- y
reg.dat$PRS <- pred_auto
auto.model <- lm(reg.formula, dat=reg.dat) %>%
    summary
(result <- data.table(
    auto = auto.model$r.squared - null.r2,
    null = null.r2
))
How much phenotypic variation does the PRS from each model explain?

Infinitesimal = 0.0248696

Grid Model = 0.001746926

Auto Model = 0.1751478