CAD_autophagy.R 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. # apolipoprotein L, 1
  2. # apolipoprotein L, 1
  3. # arylsulfatase B
  4. # Baculoviral IAP repeat-containing protein 5
  5. # BCL2/adenovirus E1B 19 kDa protein-interacting protein 3
  6. # calnexin
  7. # Caspase 8 levels
  8. # Caspase-3
  9. # C-C motif chemokine ligand 2
  10. # Cyclin-dependent kinase inhibitor 1B
  11. # Cathepsin L1 levels
  12. # cathepsin D
  13. # Cathepsin B
  14. # Cathepsin L1 levels
  15. # C-X3-C motif chemokine ligand 1
  16. # Death-associated protein kinase 2
  17. # DnaJ homolog subfamily B member 9
  18. # Glutamate receptor ionotropic, delta-2
  19. # Interferon gamma
  20. # Gamma-aminobutyric acid receptor-associated protein-like 1
  21. # Gamma-aminobutyric acid receptor-associated protein-like 2
  22. # Glutamate receptor ionotropic, delta-2
  23. # Mitogen-activated protein kinase 1
  24. # Mitogen-activated protein kinase 3
  25. # Mitogen-activated protein kinase 8
  26. # Nicotinamide phosphoribosyltransferase
  27. # Next to BRCA1 gene 1 protein
  28. # PTK6
  29. # sphingosine kinase 1
  30. # Vesicle-associated membrane protein 3
  31. # vascular endothelial growth factor A
  32. #在线孟德尔分析
  33. #加载包
  34. library(TwoSampleMR)
  35. # install.packages("ieugwasr")
  36. # install.packages("tidyverse", repos = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
  37. library(ieugwasr)
  38. # 登陆token
  39. Sys.setenv(OPENGWAS_JWT="eyJhbGciOiJSUzI1NiIsImtpZCI6ImFwaS1qd3QiLCJ0eXAiOiJKV1QifQ.eyJpc3MiOiJhcGkub3Blbmd3YXMuaW8iLCJhdWQiOiJhcGkub3Blbmd3YXMuaW8iLCJzdWIiOiJ6anQ3ODU2MzIxQGhvdG1haWwuY29tIiwiaWF0IjoxNzI0NTg2NzM3LCJleHAiOjE3MjU3OTYzMzd9.hseY1Y4YiYJ4WWi-WoQmm0VWQA6Ozqurj08HvJV8q6xh7D4Jfv5o3zdwBvU_4HhC87C59KWgXaBm5ZiMynqLSp5uFLlai_5ObswpbdLHtNRghZYti0wF7TRTGcXPa9_ICWvrEoIRNoAyj10I1nOrPE9RhRL0PrzHOYb9Y3L7oBDzmsch-boFLYE3R9ZhUoZSMMiZYNR9_PmYbJi2Lrd6C_tI1fCna9jzF5vHsQQReYgy4IS59rr_lDEwdE46dCIq2cHoDIZJ4ohArPHtoLaw5CXf0yUrY89Xo2J5KnVWHtzlFK7EEgOIpx1DX6mSgrVxXtvlFSM-j7K1qbM9Awr6JQ")
  40. ieugwasr::user()
  41. Coronary_artery_disease_list = c("ebi-a-GCST005195", "ebi-a-GCST005194")
  42. autoghagy_list <- c("prot-a-134", "prot-a-135", "prot-a-172", "prot-a-252" , "prot-a-261",
  43. "prot-a-353", "ebi-a-GCST90012072", "prot-a-364", "prot-b-4", "prot-a-495",
  44. "ebi-a-GCST90012073", "prot-b-51", "prot-a-718", "ebi-a-GCST90012073", "prot-b-70",
  45. "prot-a-763", "prot-a-842", "prot-a-1276", "prot-a-1428", "prot-a-1161",
  46. "prot-a-1162", "prot-a-1276","prot-a-1845", "prot-a-1848", "prot-a-1849",
  47. "prot-a-1998", "prot-a-2004","prot-a-2536","ebi-a-GCST90000502")
  48. result_all <- data.frame()
  49. for (i in seq_along(Coronary_artery_disease_list)) {
  50. for (j in seq_along(autoghagy_list)) {
  51. #两样本MR(在线读)
  52. #提取蛋白的SNP
  53. print(autoghagy_list[j])
  54. autoghagy <- extract_instruments(outcomes = autoghagy_list[j])
  55. #提取SNP在结局中的信息
  56. print(Coronary_artery_disease_list[i])
  57. outcome_dat <- extract_outcome_data(snps = autoghagy$SNP, outcomes = Coronary_artery_disease_list[i])
  58. #合并暴露和结局的数据
  59. dat <- harmonise_data(autoghagy, outcome_dat)
  60. #分析MR
  61. result <- mr(dat)
  62. result_all <- rbind(result_all, result)
  63. }
  64. }
  65. # 保存为 CSV 文件
  66. write.csv(result_all, file = "result_all.csv", row.names = FALSE)
  67. a = read.csv("result_all.csv")
  68. autoghagy <- extract_instruments(outcomes = "ebi-a-GCST90000502")
  69. #提取SNP在结局中的信息
  70. print(Coronary_artery_disease_list[i])
  71. outcome_dat <- extract_outcome_data(snps = autoghagy$SNP, outcomes = "ebi-a-GCST005194")
  72. #合并暴露和结局的数据
  73. dat <- harmonise_data(autoghagy, outcome_dat)
  74. #分析MR
  75. result <- mr(dat)
  76. # #由于结局是二分类,所以生成OR值
  77. # OR <- generate_odds_ratios(results)
  78. # # 异质性检验
  79. # heterogeneity <- mr_heterogeneity(dat)
  80. # res_MRPRESSO <- run_mr_presso(dat)
  81. # # 多效性检验
  82. # pleiotropy <- mr_pleiotropy_test(dat)
  83. # pleiotropy
  84. # #散点图
  85. # mr_scatter_plot(results, dat)
  86. # #留一图
  87. # leaveoneout <- mr_leaveoneout(dat)
  88. # mr_leaveoneout_plot(leaveoneout)
  89. # #森林图
  90. # results_single <- mr_singlesnp(dat)
  91. # mr_forest_plot(results_single)
  92. # #漏斗图
  93. # mr_funnel_plot(results_single)
  94. ######################################################################
  95. ######################################################################
  96. # 本地文件孟德尔分析
  97. # Major coronary heart disease event
  98. # Coronary atherosclerosis
  99. # install.packages("vroom")
  100. #加载包
  101. library(TwoSampleMR)
  102. library(vroom)
  103. library(tidyverse)
  104. library(tools)
  105. library(ieugwasr)
  106. setwd("/root/r_base")
  107. #读取eaf文件
  108. eaf_dat_ori <- vroom("protein_data/eaf_data/assocvariants.annotated.txt.gz", col_names = TRUE)
  109. eaf_dat = eaf_dat_ori[, c(3,7)]
  110. colnames(eaf_dat)
  111. # 指定文件夹路径
  112. folder_path <- "protein_data"
  113. # 获取所有.txt.gz文件
  114. file_list <- list.files(path = folder_path, pattern = "\\.txt\\.gz$", full.names = TRUE)
  115. # 遍历每个文件
  116. for (file in file_list) {
  117. # 输出当前处理的文件名
  118. file_name = file_path_sans_ext(file_path_sans_ext(basename(file)))
  119. #检查是否已经处理过
  120. if (file.exists(paste0("protein_rds_data/",file_name, ".rds"))) {
  121. print("文件存在")
  122. next
  123. }
  124. print(paste("Processing file:", file_path_sans_ext(file_path_sans_ext(basename(file)))))
  125. #读取暴露
  126. expo_dat_ori <- vroom(file, col_names = TRUE)
  127. # 暴露p < 5e-08过滤一下,强相关
  128. print("=========暴露p < 5e-08过滤一下,强相关=================")
  129. expo_dat <- subset(expo_dat_ori, expo_dat_ori$Pval<5e-08)
  130. #eaf填充
  131. print("=========eaf填充=================")
  132. expo_dat <- left_join(expo_dat, eaf_dat, by = "Name")
  133. print(colnames(expo_dat))
  134. print(dim(expo_dat))
  135. #标准化列名
  136. expo_dat = expo_dat[, c(1,2,4,5,6,7,8,10,11,13)]
  137. colnames(expo_dat) = c("chr", "pos", "SNP", "effect_allele", "other_allele", "beta", "pval", "se", "samplesize", "eaf")
  138. expo_dat$phenotype_col <- strsplit(file_name, "_")[[1]][3]
  139. #进行格式转换
  140. print("=========格式转换=================")
  141. expo_dat <- TwoSampleMR::format_data(
  142. expo_dat,
  143. type = "exposure",
  144. phenotype_col = "phenotype_col"
  145. )
  146. print(colnames(expo_dat))
  147. print(dim(expo_dat))
  148. #去除连锁不平衡
  149. print("=========去除连锁不平衡=================")
  150. # expo_dat_clump <- clump_data(expo_dat, clump_kb = 10000, clump_r2 = 0.001)
  151. exposure_data_clumped <- ld_clump(
  152. # 指定包含`rsid`、`pval`和`id`的数据框
  153. dat = tibble(rsid = expo_dat$SNP, # 从 exposure_data 中提取 SNP 标识符
  154. pval = expo_dat$pval.exposure, # 从 exposure_data 中提取关联 p 值
  155. id = expo_dat$exposure), # 从 exposure_data 中提取 SNP 的 id
  156. clump_kb = 10000, # 设置 clumping 窗口大小
  157. clump_r2 = 0.001, # 指定用于 LD 的 r^2 阈值
  158. clump_p = 1, # 设置 p 值阈值为 1,这样所有的 SNP 都会被纳入 clump,因为咱们前面已经进行过 p 值过滤了嘛
  159. bfile = "clump_file/EUR", # 指定 LD 的参考数据集,注意修改你的路径
  160. plink_bin = "clump_file/plink" # 使用指定目录下的 plink 二进制文件,注意修改你的路径
  161. )
  162. exposure_data_clumped <- expo_dat %>% filter(expo_dat$SNP %in% exposure_data_clumped$rsid)
  163. print(colnames(exposure_data_clumped))
  164. print(dim(exposure_data_clumped))
  165. #保存为rds
  166. saveRDS(exposure_data_clumped, paste0("protein_rds_data/",file_name, ".rds"))
  167. }
  168. # 读取结局
  169. outcome_dat_ori <- vroom("CHD/finngen_R11_I9_CORATHER.gz", col_names = TRUE)
  170. colnames(outcome_dat_ori)
  171. # 标准化列名
  172. outcome_dat_ori = outcome_dat_ori[, c(1,2,3,4,5,7,9,10,11)]
  173. colnames(outcome_dat_ori) = c("chr", "pos", "other_allele", "effect_allele","SNP", "pval", "beta", "se","eaf")
  174. outcome_dat_ori$phenotype_col <- "finngen_R11_I9_CORATHER"
  175. # 循环进行等位基因对齐
  176. # 指定文件夹路径
  177. folder_rds_path <- "protein_rds_data"
  178. # 获取所有.txt.gz文件
  179. file_rds_list <- list.files(path = folder_rds_path, pattern = "\\.rds$", full.names = TRUE)
  180. # 遍历每个文件进行等位基因对齐
  181. for (file_rds in file_rds_list) {
  182. # 输出当前处理的文件名
  183. file_name = file_path_sans_ext(file_path_sans_ext(basename(file_rds)))
  184. print(paste("Processing file:", file_path_sans_ext(file_path_sans_ext(basename(file_rds)))))
  185. #读取rds
  186. exposure_data_clumped = readRDS(file_rds)
  187. #暴露和结局取交集,确定哪些SNP是两者共有的
  188. print("=========暴露和结局取交集=================")
  189. data_common <- merge(exposure_data_clumped, outcome_dat_ori, by = "SNP")
  190. print(dim(data_common))
  191. print(colnames(data_common))
  192. if (nrow(data_common) > 0) {
  193. # 提取结局数据,并进行格式修改,以用于后续的 MR 分析
  194. outcome_dat <- format_data(outcome_dat_ori,
  195. # 设置类型为 "outcome",表示这是一个结局数据
  196. type = "outcome",
  197. # 选择用于 MR 分析的 SNP,这里使用了 data_common 数据集中的 SNP 列
  198. snps = data_common$SNP,
  199. # 设置 SNP 列的名称
  200. snp_col = "SNP",
  201. # 设置效应大小 (beta) 的列名称
  202. beta_col = "beta",
  203. # 设置标准误差 (SE) 的列名称
  204. se_col = "se",
  205. # 设置效应等位基因频率 (EAF) 的列名称
  206. eaf_col = "eaf",
  207. # 设置效应等位基因的列名称
  208. effect_allele_col = "effect_allele",
  209. # 设置非效应等位基因的列名称
  210. other_allele_col = "other_allele",
  211. # 设置 p 值 (p-value) 的列名称
  212. pval_col = "pval",
  213. # 设置样本大小 (sample size) 的列名称
  214. samplesize_col = "samplesize",
  215. # 设置病例数 (number of cases) 的列名称
  216. ncase_col = "ncase",
  217. # 设置 id 的列名称
  218. id_col = "id",
  219. phenotype_col = "phenotype_col")
  220. #等位基因对齐
  221. print("=========等位基因对齐=================")
  222. dat <- harmonise_data(exposure_dat = exposure_data_clumped, outcome_dat = outcome_dat)
  223. print(dim(dat))
  224. print(colnames(dat))
  225. #保存为rds
  226. saveRDS(dat, paste0("harmonise_data_CORATHER/",file_name, ".rds"))
  227. } else {
  228. print("数据条数为 0")
  229. }
  230. }
  231. # 循环进行MR分析
  232. # 指定文件夹路径
  233. folder_rds_path_m <- "harmonise_data_CHD"
  234. # 获取所有.txt.gz文件
  235. file_rds_list_m <- list.files(path = folder_rds_path_m, pattern = "\\.rds$", full.names = TRUE)
  236. # 遍历每个文件进行mr分析
  237. result_all <- data.frame()
  238. for (file_rds_m in file_rds_list_m) {
  239. #读取rds
  240. dat = readRDS(file_rds_m)
  241. #去掉结局相关SNP
  242. dat <- subset(dat, dat$pval.outcome>=5e-08)
  243. print("=========MR 分析=================")
  244. # MR 分析
  245. result <- mr(dat)
  246. result_all <- rbind(result_all, result)
  247. }
  248. # 保存为 CSV 文件
  249. write.csv(result_all, file = "result_all_CHD.csv", row.names = FALSE)
  250. a = read.csv("result_all_CHD.csv")
  251. View(a)
  252. # b = read.csv("result_all_CHD.csv")
  253. # View(b)
  254. #单独跑MR分析
  255. dat = readRDS("harmonise_data_CORATHER_result/18387_7_ST13_suppression_of_tumorigenicity_13.rds")
  256. dat <- subset(dat, dat$pval.outcome>=5e-08)
  257. result <- mr(dat)
  258. View(result)
  259. #生成OR值
  260. generate_odds_ratios(result)
  261. # 异质性检验
  262. mr_heterogeneity(dat)
  263. # 水平多效性检验
  264. mr_pleiotropy_test(dat)
  265. # 散点图
  266. p1 <- mr_scatter_plot(result, dat)
  267. p1
  268. # 森林图
  269. result_single <- mr_singlesnp(dat)
  270. p2 <- mr_forest_plot(result_single)
  271. p2
  272. # 留一图
  273. result_loo <- mr_leaveoneout(dat)
  274. p3 <- mr_leaveoneout_plot(result_loo)
  275. p3
  276. # 漏斗图
  277. result_single <- mr_singlesnp(dat)
  278. p4 <- mr_funnel_plot(result_single)
  279. p4