# apolipoprotein L, 1 # apolipoprotein L, 1 # arylsulfatase B # Baculoviral IAP repeat-containing protein 5 # BCL2/adenovirus E1B 19 kDa protein-interacting protein 3 # calnexin # Caspase 8 levels # Caspase-3 # C-C motif chemokine ligand 2 # Cyclin-dependent kinase inhibitor 1B # Cathepsin L1 levels # cathepsin D # Cathepsin B # Cathepsin L1 levels # C-X3-C motif chemokine ligand 1 # Death-associated protein kinase 2 # DnaJ homolog subfamily B member 9 # Glutamate receptor ionotropic, delta-2 # Interferon gamma # Gamma-aminobutyric acid receptor-associated protein-like 1 # Gamma-aminobutyric acid receptor-associated protein-like 2 # Glutamate receptor ionotropic, delta-2 # Mitogen-activated protein kinase 1 # Mitogen-activated protein kinase 3 # Mitogen-activated protein kinase 8 # Nicotinamide phosphoribosyltransferase # Next to BRCA1 gene 1 protein # PTK6 # sphingosine kinase 1 # Vesicle-associated membrane protein 3 # vascular endothelial growth factor A #在线孟德尔分析 #加载包 library(TwoSampleMR) # install.packages("ieugwasr") # install.packages("tidyverse", repos = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/") library(ieugwasr) # 登陆token Sys.setenv(OPENGWAS_JWT="eyJhbGciOiJSUzI1NiIsImtpZCI6ImFwaS1qd3QiLCJ0eXAiOiJKV1QifQ.eyJpc3MiOiJhcGkub3Blbmd3YXMuaW8iLCJhdWQiOiJhcGkub3Blbmd3YXMuaW8iLCJzdWIiOiJ6anQ3ODU2MzIxQGhvdG1haWwuY29tIiwiaWF0IjoxNzI0NTg2NzM3LCJleHAiOjE3MjU3OTYzMzd9.hseY1Y4YiYJ4WWi-WoQmm0VWQA6Ozqurj08HvJV8q6xh7D4Jfv5o3zdwBvU_4HhC87C59KWgXaBm5ZiMynqLSp5uFLlai_5ObswpbdLHtNRghZYti0wF7TRTGcXPa9_ICWvrEoIRNoAyj10I1nOrPE9RhRL0PrzHOYb9Y3L7oBDzmsch-boFLYE3R9ZhUoZSMMiZYNR9_PmYbJi2Lrd6C_tI1fCna9jzF5vHsQQReYgy4IS59rr_lDEwdE46dCIq2cHoDIZJ4ohArPHtoLaw5CXf0yUrY89Xo2J5KnVWHtzlFK7EEgOIpx1DX6mSgrVxXtvlFSM-j7K1qbM9Awr6JQ") ieugwasr::user() Coronary_artery_disease_list = c("ebi-a-GCST005195", "ebi-a-GCST005194") autoghagy_list <- c("prot-a-134", "prot-a-135", "prot-a-172", "prot-a-252" , "prot-a-261", "prot-a-353", "ebi-a-GCST90012072", "prot-a-364", "prot-b-4", "prot-a-495", "ebi-a-GCST90012073", "prot-b-51", "prot-a-718", "ebi-a-GCST90012073", "prot-b-70", "prot-a-763", "prot-a-842", "prot-a-1276", "prot-a-1428", "prot-a-1161", "prot-a-1162", "prot-a-1276","prot-a-1845", "prot-a-1848", "prot-a-1849", "prot-a-1998", "prot-a-2004","prot-a-2536","ebi-a-GCST90000502") result_all <- data.frame() for (i in seq_along(Coronary_artery_disease_list)) { for (j in seq_along(autoghagy_list)) { #两样本MR(在线读) #提取蛋白的SNP print(autoghagy_list[j]) autoghagy <- extract_instruments(outcomes = autoghagy_list[j]) #提取SNP在结局中的信息 print(Coronary_artery_disease_list[i]) outcome_dat <- extract_outcome_data(snps = autoghagy$SNP, outcomes = Coronary_artery_disease_list[i]) #合并暴露和结局的数据 dat <- harmonise_data(autoghagy, outcome_dat) #分析MR result <- mr(dat) result_all <- rbind(result_all, result) } } # 保存为 CSV 文件 write.csv(result_all, file = "result_all.csv", row.names = FALSE) a = read.csv("result_all.csv") autoghagy <- extract_instruments(outcomes = "ebi-a-GCST90000502") #提取SNP在结局中的信息 print(Coronary_artery_disease_list[i]) outcome_dat <- extract_outcome_data(snps = autoghagy$SNP, outcomes = "ebi-a-GCST005194") #合并暴露和结局的数据 dat <- harmonise_data(autoghagy, outcome_dat) #分析MR result <- mr(dat) # #由于结局是二分类,所以生成OR值 # OR <- generate_odds_ratios(results) # # 异质性检验 # heterogeneity <- mr_heterogeneity(dat) # res_MRPRESSO <- run_mr_presso(dat) # # 多效性检验 # pleiotropy <- mr_pleiotropy_test(dat) # pleiotropy # #散点图 # mr_scatter_plot(results, dat) # #留一图 # leaveoneout <- mr_leaveoneout(dat) # mr_leaveoneout_plot(leaveoneout) # #森林图 # results_single <- mr_singlesnp(dat) # mr_forest_plot(results_single) # #漏斗图 # mr_funnel_plot(results_single) ###################################################################### ###################################################################### # 本地文件孟德尔分析 # Major coronary heart disease event # Coronary atherosclerosis # install.packages("vroom") #加载包 library(TwoSampleMR) library(vroom) library(tidyverse) library(tools) library(ieugwasr) setwd("/root/r_base") #读取eaf文件 eaf_dat_ori <- vroom("protein_data/eaf_data/assocvariants.annotated.txt.gz", col_names = TRUE) eaf_dat = eaf_dat_ori[, c(3,7)] colnames(eaf_dat) # 指定文件夹路径 folder_path <- "protein_data" # 获取所有.txt.gz文件 file_list <- list.files(path = folder_path, pattern = "\\.txt\\.gz$", full.names = TRUE) # 遍历每个文件 for (file in file_list) { # 输出当前处理的文件名 file_name = file_path_sans_ext(file_path_sans_ext(basename(file))) #检查是否已经处理过 if (file.exists(paste0("protein_rds_data/",file_name, ".rds"))) { print("文件存在") next } print(paste("Processing file:", file_path_sans_ext(file_path_sans_ext(basename(file))))) #读取暴露 expo_dat_ori <- vroom(file, col_names = TRUE) # 暴露p < 5e-08过滤一下,强相关 print("=========暴露p < 5e-08过滤一下,强相关=================") expo_dat <- subset(expo_dat_ori, expo_dat_ori$Pval<5e-08) #eaf填充 print("=========eaf填充=================") expo_dat <- left_join(expo_dat, eaf_dat, by = "Name") print(colnames(expo_dat)) print(dim(expo_dat)) #标准化列名 expo_dat = expo_dat[, c(1,2,4,5,6,7,8,10,11,13)] colnames(expo_dat) = c("chr", "pos", "SNP", "effect_allele", "other_allele", "beta", "pval", "se", "samplesize", "eaf") expo_dat$phenotype_col <- strsplit(file_name, "_")[[1]][3] #进行格式转换 print("=========格式转换=================") expo_dat <- TwoSampleMR::format_data( expo_dat, type = "exposure", phenotype_col = "phenotype_col" ) print(colnames(expo_dat)) print(dim(expo_dat)) #去除连锁不平衡 print("=========去除连锁不平衡=================") # expo_dat_clump <- clump_data(expo_dat, clump_kb = 10000, clump_r2 = 0.001) exposure_data_clumped <- ld_clump( # 指定包含`rsid`、`pval`和`id`的数据框 dat = tibble(rsid = expo_dat$SNP, # 从 exposure_data 中提取 SNP 标识符 pval = expo_dat$pval.exposure, # 从 exposure_data 中提取关联 p 值 id = expo_dat$exposure), # 从 exposure_data 中提取 SNP 的 id clump_kb = 10000, # 设置 clumping 窗口大小 clump_r2 = 0.001, # 指定用于 LD 的 r^2 阈值 clump_p = 1, # 设置 p 值阈值为 1,这样所有的 SNP 都会被纳入 clump,因为咱们前面已经进行过 p 值过滤了嘛 bfile = "clump_file/EUR", # 指定 LD 的参考数据集,注意修改你的路径 plink_bin = "clump_file/plink" # 使用指定目录下的 plink 二进制文件,注意修改你的路径 ) exposure_data_clumped <- expo_dat %>% filter(expo_dat$SNP %in% exposure_data_clumped$rsid) print(colnames(exposure_data_clumped)) print(dim(exposure_data_clumped)) #保存为rds saveRDS(exposure_data_clumped, paste0("protein_rds_data/",file_name, ".rds")) } # 读取结局 outcome_dat_ori <- vroom("CHD/finngen_R11_I9_CORATHER.gz", col_names = TRUE) colnames(outcome_dat_ori) # 标准化列名 outcome_dat_ori = outcome_dat_ori[, c(1,2,3,4,5,7,9,10,11)] colnames(outcome_dat_ori) = c("chr", "pos", "other_allele", "effect_allele","SNP", "pval", "beta", "se","eaf") outcome_dat_ori$phenotype_col <- "finngen_R11_I9_CORATHER" # 循环进行等位基因对齐 # 指定文件夹路径 folder_rds_path <- "protein_rds_data" # 获取所有.txt.gz文件 file_rds_list <- list.files(path = folder_rds_path, pattern = "\\.rds$", full.names = TRUE) # 遍历每个文件进行等位基因对齐 for (file_rds in file_rds_list) { # 输出当前处理的文件名 file_name = file_path_sans_ext(file_path_sans_ext(basename(file_rds))) print(paste("Processing file:", file_path_sans_ext(file_path_sans_ext(basename(file_rds))))) #读取rds exposure_data_clumped = readRDS(file_rds) #暴露和结局取交集,确定哪些SNP是两者共有的 print("=========暴露和结局取交集=================") data_common <- merge(exposure_data_clumped, outcome_dat_ori, by = "SNP") print(dim(data_common)) print(colnames(data_common)) if (nrow(data_common) > 0) { # 提取结局数据,并进行格式修改,以用于后续的 MR 分析 outcome_dat <- format_data(outcome_dat_ori, # 设置类型为 "outcome",表示这是一个结局数据 type = "outcome", # 选择用于 MR 分析的 SNP,这里使用了 data_common 数据集中的 SNP 列 snps = data_common$SNP, # 设置 SNP 列的名称 snp_col = "SNP", # 设置效应大小 (beta) 的列名称 beta_col = "beta", # 设置标准误差 (SE) 的列名称 se_col = "se", # 设置效应等位基因频率 (EAF) 的列名称 eaf_col = "eaf", # 设置效应等位基因的列名称 effect_allele_col = "effect_allele", # 设置非效应等位基因的列名称 other_allele_col = "other_allele", # 设置 p 值 (p-value) 的列名称 pval_col = "pval", # 设置样本大小 (sample size) 的列名称 samplesize_col = "samplesize", # 设置病例数 (number of cases) 的列名称 ncase_col = "ncase", # 设置 id 的列名称 id_col = "id", phenotype_col = "phenotype_col") #等位基因对齐 print("=========等位基因对齐=================") dat <- harmonise_data(exposure_dat = exposure_data_clumped, outcome_dat = outcome_dat) print(dim(dat)) print(colnames(dat)) #保存为rds saveRDS(dat, paste0("harmonise_data_CORATHER/",file_name, ".rds")) } else { print("数据条数为 0") } } # 循环进行MR分析 # 指定文件夹路径 folder_rds_path_m <- "harmonise_data_CHD" # 获取所有.txt.gz文件 file_rds_list_m <- list.files(path = folder_rds_path_m, pattern = "\\.rds$", full.names = TRUE) # 遍历每个文件进行mr分析 result_all <- data.frame() for (file_rds_m in file_rds_list_m) { #读取rds dat = readRDS(file_rds_m) #去掉结局相关SNP dat <- subset(dat, dat$pval.outcome>=5e-08) print("=========MR 分析=================") # MR 分析 result <- mr(dat) result_all <- rbind(result_all, result) } # 保存为 CSV 文件 write.csv(result_all, file = "result_all_CHD.csv", row.names = FALSE) a = read.csv("result_all_CHD.csv") View(a) # b = read.csv("result_all_CHD.csv") # View(b) #单独跑MR分析 dat = readRDS("harmonise_data_CORATHER_result/18387_7_ST13_suppression_of_tumorigenicity_13.rds") dat <- subset(dat, dat$pval.outcome>=5e-08) result <- mr(dat) View(result) #生成OR值 generate_odds_ratios(result) # 异质性检验 mr_heterogeneity(dat) # 水平多效性检验 mr_pleiotropy_test(dat) # 散点图 p1 <- mr_scatter_plot(result, dat) p1 # 森林图 result_single <- mr_singlesnp(dat) p2 <- mr_forest_plot(result_single) p2 # 留一图 result_loo <- mr_leaveoneout(dat) p3 <- mr_leaveoneout_plot(result_loo) p3 # 漏斗图 result_single <- mr_singlesnp(dat) p4 <- mr_funnel_plot(result_single) p4