Bladeren bron

增加处理CLHLS的代码

root 3 maanden geleden
bovenliggende
commit
74804eb487
5 gewijzigde bestanden met toevoegingen van 485 en 10 verwijderingen
  1. 213 3
      CAD_autophagy.R
  2. 259 0
      CLHLS_P/CLHLS_process.py
  3. 3 0
      HRS_P/HRS_preprocess.py
  4. 8 7
      MR相关工作.md
  5. 2 0
      README.md

+ 213 - 3
CAD_autophagy.R

@@ -33,6 +33,8 @@
 #加载包
 library(TwoSampleMR)
 # install.packages("ieugwasr")
+# install.packages("tidyverse", repos = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
+
 library(ieugwasr)
 # 登陆token
 Sys.setenv(OPENGWAS_JWT="eyJhbGciOiJSUzI1NiIsImtpZCI6ImFwaS1qd3QiLCJ0eXAiOiJKV1QifQ.eyJpc3MiOiJhcGkub3Blbmd3YXMuaW8iLCJhdWQiOiJhcGkub3Blbmd3YXMuaW8iLCJzdWIiOiJ6anQ3ODU2MzIxQGhvdG1haWwuY29tIiwiaWF0IjoxNzI0NTg2NzM3LCJleHAiOjE3MjU3OTYzMzd9.hseY1Y4YiYJ4WWi-WoQmm0VWQA6Ozqurj08HvJV8q6xh7D4Jfv5o3zdwBvU_4HhC87C59KWgXaBm5ZiMynqLSp5uFLlai_5ObswpbdLHtNRghZYti0wF7TRTGcXPa9_ICWvrEoIRNoAyj10I1nOrPE9RhRL0PrzHOYb9Y3L7oBDzmsch-boFLYE3R9ZhUoZSMMiZYNR9_PmYbJi2Lrd6C_tI1fCna9jzF5vHsQQReYgy4IS59rr_lDEwdE46dCIq2cHoDIZJ4ohArPHtoLaw5CXf0yUrY89Xo2J5KnVWHtzlFK7EEgOIpx1DX6mSgrVxXtvlFSM-j7K1qbM9Awr6JQ")
@@ -104,8 +106,216 @@ result <- mr(dat)
 # #漏斗图
 # mr_funnel_plot(results_single)
 
+
+######################################################################
+######################################################################
+
+
+
 # 本地文件孟德尔分析
-install.packages("vroom")
+# Major coronary heart disease event
+# Coronary atherosclerosis
+# install.packages("vroom")
+#加载包
+library(TwoSampleMR)
 library(vroom)
-setwd("/root/zjt")
-a <- vroom("11510_31_APOL1_Apo_L1.txt", col_names = TRUE)
+library(tidyverse)
+library(tools)
+library(ieugwasr)
+setwd("/root/r_base")
+
+#读取eaf文件
+eaf_dat_ori <- vroom("protein_data/eaf_data/assocvariants.annotated.txt.gz", col_names = TRUE)
+eaf_dat = eaf_dat_ori[, c(3,7)]
+colnames(eaf_dat)
+
+# 指定文件夹路径
+folder_path <- "protein_data"
+
+# 获取所有.txt.gz文件
+file_list <- list.files(path = folder_path, pattern = "\\.txt\\.gz$", full.names = TRUE)
+
+# 遍历每个文件
+for (file in file_list) {
+  # 输出当前处理的文件名
+  file_name = file_path_sans_ext(file_path_sans_ext(basename(file)))
+  #检查是否已经处理过
+  if (file.exists(paste0("protein_rds_data/",file_name, ".rds"))) {
+    print("文件存在")
+    next
+  } 
+  print(paste("Processing file:", file_path_sans_ext(file_path_sans_ext(basename(file)))))
+  #读取暴露
+  expo_dat_ori <- vroom(file, col_names = TRUE)
+  # 暴露p < 5e-08过滤一下,强相关
+  print("=========暴露p < 5e-08过滤一下,强相关=================")
+  expo_dat <- subset(expo_dat_ori, expo_dat_ori$Pval<5e-08)
+
+  #eaf填充
+  print("=========eaf填充=================")
+  expo_dat <- left_join(expo_dat, eaf_dat, by = "Name")
+  print(colnames(expo_dat))
+  print(dim(expo_dat))
+
+  #标准化列名
+  expo_dat = expo_dat[, c(1,2,4,5,6,7,8,10,11,13)]
+  colnames(expo_dat) = c("chr", "pos", "SNP", "effect_allele", "other_allele", "beta", "pval", "se", "samplesize", "eaf")
+  expo_dat$phenotype_col <- strsplit(file_name, "_")[[1]][3]
+  #进行格式转换
+  print("=========格式转换=================")
+  expo_dat <- TwoSampleMR::format_data(
+    expo_dat,
+    type = "exposure",
+    phenotype_col = "phenotype_col"
+  )
+  print(colnames(expo_dat))
+  print(dim(expo_dat))
+  #去除连锁不平衡
+  print("=========去除连锁不平衡=================")
+  # expo_dat_clump <- clump_data(expo_dat, clump_kb = 10000, clump_r2 = 0.001)
+  exposure_data_clumped <- ld_clump(
+    # 指定包含`rsid`、`pval`和`id`的数据框
+    dat = tibble(rsid = expo_dat$SNP,  # 从 exposure_data 中提取 SNP 标识符
+                pval = expo_dat$pval.exposure,  # 从 exposure_data 中提取关联 p 值
+                id = expo_dat$exposure),  # 从 exposure_data 中提取 SNP 的 id
+    clump_kb = 10000,      # 设置 clumping 窗口大小
+    clump_r2 = 0.001,      # 指定用于 LD 的 r^2 阈值
+    clump_p = 1,           # 设置 p 值阈值为 1,这样所有的 SNP 都会被纳入 clump,因为咱们前面已经进行过 p 值过滤了嘛
+    bfile = "clump_file/EUR",       # 指定 LD 的参考数据集,注意修改你的路径
+    plink_bin = "clump_file/plink"  # 使用指定目录下的 plink 二进制文件,注意修改你的路径
+  )
+  exposure_data_clumped <- expo_dat %>% filter(expo_dat$SNP %in% exposure_data_clumped$rsid)
+  print(colnames(exposure_data_clumped))
+  print(dim(exposure_data_clumped))
+  #保存为rds
+  saveRDS(exposure_data_clumped, paste0("protein_rds_data/",file_name, ".rds"))
+}
+
+# 读取结局
+outcome_dat_ori <- vroom("CHD/finngen_R11_I9_CORATHER.gz", col_names = TRUE)
+colnames(outcome_dat_ori)
+# 标准化列名
+outcome_dat_ori = outcome_dat_ori[, c(1,2,3,4,5,7,9,10,11)]
+colnames(outcome_dat_ori) = c("chr", "pos", "other_allele", "effect_allele","SNP", "pval", "beta", "se","eaf")
+outcome_dat_ori$phenotype_col <- "finngen_R11_I9_CORATHER"
+
+# 循环进行等位基因对齐
+# 指定文件夹路径
+folder_rds_path <- "protein_rds_data"
+
+# 获取所有.txt.gz文件
+file_rds_list <- list.files(path = folder_rds_path, pattern = "\\.rds$", full.names = TRUE)
+# 遍历每个文件进行等位基因对齐
+for (file_rds in file_rds_list) {
+  # 输出当前处理的文件名
+  file_name = file_path_sans_ext(file_path_sans_ext(basename(file_rds)))
+  print(paste("Processing file:", file_path_sans_ext(file_path_sans_ext(basename(file_rds)))))
+  #读取rds
+  exposure_data_clumped = readRDS(file_rds)
+  #暴露和结局取交集,确定哪些SNP是两者共有的
+  print("=========暴露和结局取交集=================")
+  data_common <- merge(exposure_data_clumped, outcome_dat_ori, by = "SNP")
+  print(dim(data_common))
+  print(colnames(data_common))
+  if (nrow(data_common) > 0) {
+    # 提取结局数据,并进行格式修改,以用于后续的 MR 分析
+    outcome_dat <- format_data(outcome_dat_ori,
+                                # 设置类型为 "outcome",表示这是一个结局数据
+                                type = "outcome",
+                                # 选择用于 MR 分析的 SNP,这里使用了 data_common 数据集中的 SNP 列
+                                snps = data_common$SNP,
+                                # 设置 SNP 列的名称
+                                snp_col = "SNP",
+                                # 设置效应大小 (beta) 的列名称
+                                beta_col = "beta",
+                                # 设置标准误差 (SE) 的列名称
+                                se_col = "se",
+                                # 设置效应等位基因频率 (EAF) 的列名称
+                                eaf_col = "eaf",
+                                # 设置效应等位基因的列名称
+                                effect_allele_col = "effect_allele",
+                                # 设置非效应等位基因的列名称
+                                other_allele_col = "other_allele",
+                                # 设置 p 值 (p-value) 的列名称
+                                pval_col = "pval",
+                                # 设置样本大小 (sample size) 的列名称
+                                samplesize_col = "samplesize",
+                                # 设置病例数 (number of cases) 的列名称
+                                ncase_col = "ncase",
+                                # 设置 id 的列名称
+                                id_col = "id",
+                                phenotype_col = "phenotype_col")
+
+    #等位基因对齐
+    print("=========等位基因对齐=================")
+    dat <- harmonise_data(exposure_dat = exposure_data_clumped, outcome_dat = outcome_dat)
+    print(dim(dat))
+    print(colnames(dat))
+    #保存为rds
+    saveRDS(dat, paste0("harmonise_data_CORATHER/",file_name, ".rds"))
+  } else {
+    print("数据条数为 0")
+  }
+}
+
+# 循环进行MR分析
+# 指定文件夹路径
+folder_rds_path_m <- "harmonise_data_CHD"
+
+# 获取所有.txt.gz文件
+file_rds_list_m <- list.files(path = folder_rds_path_m, pattern = "\\.rds$", full.names = TRUE)
+
+# 遍历每个文件进行mr分析
+result_all <- data.frame()
+for (file_rds_m in file_rds_list_m) {
+  #读取rds
+  dat = readRDS(file_rds_m)
+  #去掉结局相关SNP
+  dat <- subset(dat, dat$pval.outcome>=5e-08)
+  print("=========MR 分析=================")
+  # MR 分析
+  result <- mr(dat)
+  result_all <- rbind(result_all, result)
+}
+# 保存为 CSV 文件
+write.csv(result_all, file = "result_all_CHD.csv", row.names = FALSE)
+
+a = read.csv("result_all_CHD.csv")
+View(a)
+
+# b = read.csv("result_all_CHD.csv")
+# View(b)
+
+
+#单独跑MR分析
+dat = readRDS("harmonise_data_CORATHER_result/18387_7_ST13_suppression_of_tumorigenicity_13.rds")
+dat <- subset(dat, dat$pval.outcome>=5e-08)
+result <- mr(dat)
+View(result)
+#生成OR值 
+generate_odds_ratios(result)
+
+# 异质性检验
+mr_heterogeneity(dat)
+
+# 水平多效性检验
+mr_pleiotropy_test(dat)
+
+# 散点图
+p1 <- mr_scatter_plot(result, dat)
+p1
+
+# 森林图
+result_single <- mr_singlesnp(dat)
+p2 <- mr_forest_plot(result_single)
+p2
+
+# 留一图
+result_loo <- mr_leaveoneout(dat)
+p3 <- mr_leaveoneout_plot(result_loo)
+p3
+
+# 漏斗图
+result_single <- mr_singlesnp(dat)
+p4 <- mr_funnel_plot(result_single)
+p4

+ 259 - 0
CLHLS_P/CLHLS_process.py

@@ -0,0 +1,259 @@
+import pandas as pd
+import pyreadstat
+import numpy as np
+from savReaderWriter import SavReader
+
+def sav2csv(sav_file_path, csv_file_path):
+    #读取sav数据
+    with SavReader(sav_file_path) as reader:
+        header = reader.header
+        data = reader.all()
+    #转化为Dataframe
+    df = pd.DataFrame(data, columns=header)
+    # 将列名从字节串转换为字符串
+    df.columns = [col.decode('utf-8') if isinstance(col, bytes) else col for col in df.columns]
+    # 另存为csv数据
+    df.to_csv(csv_file_path, index=False)
+
+#进行adl转换
+def trans_adl(columns_adl, data,  result, adl_name):
+    # 定义转换规则
+    transformation = {1: 0, 2: 1, 3: 2}
+    for column in columns_adl:
+        data[column] = data[column].map(transformation).fillna(np.nan)  # 将其他转换为缺失值
+    # 计算转换后列的总和并创建一个新列
+    data[adl_name] = data[columns_adl].sum(axis=1)
+    result[adl_name] = data[adl_name].apply(lambda x : 0 if x==0 else (1 if x>0 and x<=12 else np.nan))
+
+#进行mmse转换
+def trans_mmse(columns_mmse, data):
+    # 定义转换规则
+    transformation = {0: 0, 1: 1}
+    for column in columns_mmse:
+        data[column] = data[column].map(transformation).fillna(np.nan)  # 将其他转换为缺失值
+
+#进行mmsec16转换
+def trans_mmse_c16(columns_mmse, data):
+    # 定义转换规则
+    for column in columns_mmse:
+        data[column] = data[column].apply(lambda x : np.nan if x==88 or x==99 or x==-9 or x==-8 or x==-7 or x==-6 else 1)  # 将其他转换为缺失值
+
+def get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, columns_memory_98 ,columns_language_98, data, result, cognitive_name):
+    # 计算一般能力
+    result["general_cognitive_"+cognitive_name] = data[columns_cognitive_98].sum(axis=1) *2
+    # 计算反应能力
+    result["reaction_"+cognitive_name] = data[columns_reaction_98].sum(axis=1)
+    # 计算注意力与计算力
+    result["attention_calculation_"+cognitive_name] = data[columns_attention_98].sum(axis=1)
+    # 计算回忆力
+    result["memory_"+cognitive_name] = data[columns_memory_98].sum(axis=1)
+    # 计算语言能力和自我协调
+    result["language_selfcoordination_"+cognitive_name] = data[columns_language_98].sum(axis=1)
+    # 计算总合
+    result['mmse_'+cognitive_name] = result["general_cognitive_"+cognitive_name] + result["reaction_"+cognitive_name]+ result["attention_calculation_"+cognitive_name]+ result["memory_"+cognitive_name]+ result["language_selfcoordination_"+cognitive_name]
+
+if __name__ == "__main__":
+    sav_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.sav"
+    csv_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.csv"
+    # 将sav数据转为csv
+    # sav2csv(sav_file_path, csv_file_path)
+    #处理数据
+    data = pd.read_csv(csv_file_path)
+    # 存活状态0存活;1死亡;-9失访;-8死亡/失访
+    result = data[['id', 'dth98_00','dth00_02', 'dth02_05', 'dth02_05', 'dth05_08', 'dth08_11', 'dth11_14', 'dth14_18']]
+    # 人口特征学变量
+    # 8/9代表无法回答和缺失
+    # 年龄
+    result[['trueage_98','trueage_00','trueage_02','trueage_05','trueage_08','trueage_11', 'trueage_14', 'trueage_18']] = data[['trueage','vage_0','vage_2', 'vage_5', 'vage_8','vage_11', 'trueage_14', 'trueage_18']]
+    # 性别 1男;0女
+    result['sex'] = data['a1'].apply(lambda x : 1 if x==1 else 0)
+    # 民族 1汉族;0非汉族
+    result['ethnic'] = data['a2'].apply(lambda x : 1 if x==1 else 0)
+    # 出生地 1城市;0农村
+    result['birth_place'] = data['a42'].apply(lambda x : 1 if x == 1 else (0 if x == 2 else np.nan))
+    # 教育状况 无11年
+    result['edu_98'] = data['f1'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    result['edu_08'] = data['f1_8'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    result['edu_14'] = data['f1_14'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    result['edu_18'] = data['f1_18'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    # 婚姻状况 0separated/divorced/widowed/never married; 1currently married and living with spouse
+    result['marital_98'] = data['f41'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
+    result['marital_00'] = data['f41_0'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 else 1))
+    result['marital_02'] = data['f41_2'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 else 1))
+    result['marital_05'] = data['f41_5'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 else 1))
+    result['marital_08'] = data['f41_8'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 or x==-7 else 1))
+    result['marital_08'] = data['f41_8'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
+    result['marital_11'] = data['f41_11'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
+    result['marital_14'] = data['f41_14'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
+    result['marital_18'] = data['f41_18'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
+    # 生活是否富裕 1富裕及以上;0一般及以下
+    result['econ_state_00'] = data['f34_0'].apply(lambda x : 0 if x==2 or x==3 else (1 if x==1 else np.nan))
+    result['econ_state_02'] = data['f34_2'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_05'] = data['f34_5'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_08'] = data['f34_8'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_11'] = data['f34_11'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_14'] = data['f34_14'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_18'] = data['f34_18'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    # 上一年家庭收入 99998超过10万
+    result['income_02'] = data['f35_2'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_05'] = data['f35_5'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_08'] = data['f35_8'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_11'] = data['f35_11'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_14'] = data['f35_14'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_18'] = data['f35_18'].apply(lambda x : x if x== 99998 else np.nan)
+    # 居住状态 1与家庭成员同住;2独居;3在机构居住
+    result['co_residence_98'] = data['a51'].apply(lambda x : np.nan if x==9 else x)
+    result['co_residence_00'] = data['a51_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
+    result['co_residence_02'] = data['a51_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['co_residence_05'] = data['a51_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['co_residence_08'] = data['a51_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['co_residence_11'] = data['a51_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['co_residence_14'] = data['a51_14'].apply(lambda x : np.nan if x==9 else x)
+    result['co_residence_18'] = data['a51_18'].apply(lambda x : np.nan if x==9 else x)
+    # 目前是否吸烟 1是;2否
+    result['smoke_98'] = data['d71'].apply(lambda x : np.nan if x==9 else x)
+    result['smoke_00'] = data['d71_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
+    result['smoke_02'] = data['d71_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['smoke_05'] = data['d71_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['smoke_08'] = data['d71_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['smoke_11'] = data['d71_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['smoke_14'] = data['d71_14'].apply(lambda x : np.nan if x==9 else x)
+    result['smoke_18'] = data['d71_18'].apply(lambda x : np.nan if x==9 else x)
+    # 目前是否饮酒 1是;2否
+    result['drink_98'] = data['d81'].apply(lambda x : np.nan if x==9 else x)
+    result['drink_00'] = data['d81_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
+    result['drink_02'] = data['d81_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['drink_05'] = data['d81_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['drink_08'] = data['d81_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['drink_11'] = data['d81_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['drink_14'] = data['d81_14'].apply(lambda x : np.nan if x==9 else x)
+    result['drink_18'] = data['d81_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    # 目前是否锻炼
+    result['exercise_98'] = data['d91'].apply(lambda x : np.nan if x==9 else x)
+    result['exercise_00'] = data['d91_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
+    result['exercise_02'] = data['d91_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['exercise_05'] = data['d91_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
+    result['exercise_08'] = data['d91_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
+    result['exercise_11'] = data['d91_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
+    result['exercise_14'] = data['d91_14'].apply(lambda x : np.nan if x==9 else x)
+    result['exercise_18'] = data['d91_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    # 健康状况变量 1very good; 2good; 3so so; 4bad; 5very bad; 
+    result['self_reported_helth_98'] = data['b12'].apply(lambda x : np.nan if x==9 or x==8 else x)
+    result['self_reported_helth_00'] = data['b12_0'].apply(lambda x : np.nan if x==8 or x==-9 or x == -6 else x)
+    result['self_reported_helth_02'] = data['b12_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 or x == 8 else x)
+    result['self_reported_helth_05'] = data['b12_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 or x == 8 else x)
+    result['self_reported_helth_08'] = data['b12_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
+    result['self_reported_helth_11'] = data['b12_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
+    result['self_reported_helth_14'] = data['b12_14'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    result['self_reported_helth_18'] = data['b12_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    # 慢性病
+    result['chronic_00'] = data['g14a1_0'].apply(lambda x : np.nan if x==66 or x==89 or x==99 or x==-9 or x == -6 or x == -1 else x)
+    result['chronic_02'] = data['g14a1_2'].apply(lambda x : np.nan if x==-9 or x == -8 or x == -6 or x == -1 else x)
+    result['chronic_05'] = data['g14a1_5'].apply(lambda x : np.nan if x==888 or x==999 or x==-9 or x == -8 or x == -6 or x == -1 else x)
+    result['chronic_08'] = data['g14a1_8'].apply(lambda x : np.nan if x==66 or x==88 or x == 99 or x==-9 or x == -8 or x == -7 or x == -1 else x)
+    result['chronic_11'] = data['g14a1_11'].apply(lambda x : np.nan if x==66 or x==88 or x==99 or x==-9 or x == -8 or x == -7 or x==-1 else x)
+    result['chronic_14'] = data['g14a1_14'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
+    result['chronic_18'] = data['g14a1_18'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
+    
+    # 抑郁量表得分-only 18年  0无抑郁症;1有抑郁症
+    # 简版流调中心抑郁量表(CESD-10)10个CESD项目,每个项目的分值范围为0到3分,将每个CESD项目的分值相加,得到总得分
+    # 定义转换规则
+    transformation_one = {1: 3, 2: 2, 3: 2, 4: 1, 5: 0}
+    # 应用转换规则
+    columns_cesd_one = ['b31_18', 'b32_18', 'b33_18', 'b34_18', 'b36_18', 'b38_18', 'b39_18']
+    for column_one in columns_cesd_one:
+        data[column_one] = data[column_one].map(transformation_one).fillna(np.nan)  # 将8转换为缺失值
+    # 定义转换规则
+    transformation_two = {1: 0, 2: 1, 3: 1, 4: 2, 5: 3}
+    # 应用转换规则
+    columns_cesd_two = ['b35_18', 'b37_18', 'b310a_18']
+    for column_two in columns_cesd_two:
+        data[column_two] = data[column_two].map(transformation_two).fillna(np.nan)  # 将8转换为缺失值
+    result['cesd'] = data['b31_18'] + data['b32_18'] + data['b33_18'] + data['b34_18'] + data['b36_18'] + data['b38_18'] + data['b39_18'] + data['b35_18'] + data['b37_18'] + data['b310a_18'] 
+    result['cesd_d'] = result['cesd'].apply(lambda x : 0 if x >= 0 and x <= 15 else (1 if x >=16 and x <= 30 else np.nan))
+    
+    # 日常生活活动能力 0无残疾;1有残疾
+    # ADL6个项目bathing, dressing, eating, indoor transferring, toileting, and continence, 每个项目的分值范围是0到2分, 将每个ADL项目的得分相加,得到总得分
+    columns_adl_00 = ['e1_0', 'e2_0', 'e3_0', 'e4_0', 'e5_0', 'e6_0']
+    trans_adl(columns_adl_00, data, result, "adl_00")
+    columns_adl_02 = ['e1_2', 'e2_2', 'e3_2', 'e4_2', 'e5_2', 'e6_2']
+    trans_adl(columns_adl_02, data, result, "adl_02")
+    columns_adl_05 = ['e1_5', 'e2_5', 'e3_5', 'e4_5', 'e5_5', 'e6_5']
+    trans_adl(columns_adl_05, data, result, "adl_05")
+    columns_adl_08 = ['e1_8', 'e2_8', 'e3_8', 'e4_8', 'e5_8', 'e6_8']
+    trans_adl(columns_adl_08, data, result, "adl_08")
+    columns_adl_11 = ['e1_11', 'e2_11', 'e3_11', 'e4_11', 'e5_11', 'e6_11']
+    trans_adl(columns_adl_11, data, result, "adl_11")
+    columns_adl_14 = ['e1_14', 'e2_14', 'e3_14', 'e4_14', 'e5_14', 'e6_14']
+    trans_adl(columns_adl_14, data, result, "adl_14")
+    columns_adl_18 = ['e1_18', 'e2_18', 'e3_18', 'e4_18', 'e5_18', 'e6_18']
+    trans_adl(columns_adl_18, data, result, "adl_18")
+    # 认知功能 0有认知功能障碍;1认知功能正常
+    # 简易精神状态评价量表(Mini-mental State Examination, MMSE),该量表包括一般能力(12分),反应能力(3分),注意力与计算力(6分),回忆力(3分),语言理解
+    # 与自我协调能力(6分)5个部分24个问题,总分30分,分数越高,表示认知功能水平越高
+    columns_mmse = ["c11", "c12", "c13", "c14", "c15", "c21a", "c21b", "c21c", "c31a", "c31b", "c31c", "c31d", "c31e", "c32", "c41a",  "c41b", "c41c", "c51a", "c51b", "c52", "c53a", "c53b", "c53c",
+                    "c11_0", "c12_0", "c13_0", "c14_0", "c15_0", "c21a_0", "c21b_0", "c21c_0", "c31a_0", "c31b_0", "c31c_0", "c31d_0", "c31e_0", "c32_0", "c41a_0",  "c41b_0", "c41c_0", "c51a_0", "c51b_0", "c52_0", "c53a_0", "c53b_0", "c53c_0",
+                    "c11_2", "c12_2", "c13_2", "c14_2", "c15_2", "c21a_2", "c21b_2", "c21c_2", "c31a_2", "c31b_2", "c31c_2", "c31d_2", "c31e_2", "c32_2", "c41a_2",  "c41b_2", "c41c_2", "c51a_2", "c51b_2", "c52_2", "c53a_2", "c53b_2", "c53c_2",
+                    "c11_5", "c12_5", "c13_5", "c14_5", "c15_5", "c21a_5", "c21b_5", "c21c_5", "c31a_5", "c31b_5", "c31c_5", "c31d_5", "c31e_5", "c32_5", "c41a_5",  "c41b_5", "c41c_5", "c51a_5", "c51b_5", "c52_5", "c53a_5", "c53b_5", "c53c_5",
+                    "c11_8", "c12_8", "c13_8", "c14_8", "c15_8", "c21a_8", "c21b_8", "c21c_8", "c31a_8", "c31b_8", "c31c_8", "c31d_8", "c31e_8", "c32_8", "c41a_8",  "c41b_8", "c41c_8", "c51a_8", "c51b_8", "c52_8", "c53a_8", "c53b_8", "c53c_8",
+                    "c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c21a_11", "c21b_11", "c21c_11", "c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11", "c41a_11",  "c41b_11", "c41c_11", "c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11",
+                    "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14",  "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
+                    "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18",  "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
+    trans_mmse(columns_mmse, data)
+    columns_mmse_c16 = ["c16", "c16_0", "c16_2", "c16_5", "c16_8", "c16_11", "c16_14", "c16_8"]
+    trans_mmse_c16(columns_mmse, data)
+    columns_cognitive_98 = ["c11", "c12", "c13", "c14", "c15", "c16"]
+    columns_reaction_98 = ["c21a", "c21b", "c21c"]
+    columns_attention_98 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
+    columns_memory_98 = ["c41a",  "c41b", "c41c"]
+    columns_language_98 = ["c51a", "c51b", "c52", "c53a", "c53b", "c53c"]
+    get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, columns_memory_98 ,columns_language_98, data, result, "98")
+    columns_cognitive_00 = ["c11_0", "c12_0", "c13_0", "c14_0", "c15_0", "c16_0"]
+    columns_reaction_00 = ["c21a_0", "c21b_0", "c21c_0"]
+    columns_attention_00 = ["c31a_0", "c31b_0", "c31c_0", "c31d_0", "c31e_0", "c32_0"]
+    columns_memory_00 = ["c41a_0",  "c41b_0", "c41c_0"]
+    columns_language_00 = ["c51a_0", "c51b_0", "c52_0", "c53a_0", "c53b_0", "c53c_0"]
+    get_mmse(columns_cognitive_00,columns_reaction_00,columns_attention_00, columns_memory_00 ,columns_language_00, data, result, "00")
+    columns_cognitive_02 = ["c11_2", "c12_2", "c13_2", "c14_2", "c15_2", "c16_2"]
+    columns_reaction_02 = ["c21a_2", "c21b_2", "c21c_2"]
+    columns_attention_02 = ["c31a_2", "c31b_2", "c31c_2", "c31d_2", "c31e_2", "c32_2"]
+    columns_memory_02 = ["c41a_2",  "c41b_2", "c41c_2"]
+    columns_language_02 = ["c51a_2", "c51b_2", "c52_2", "c53a_2", "c53b_2", "c53c_2"]
+    get_mmse(columns_cognitive_02,columns_reaction_02,columns_attention_02, columns_memory_02 ,columns_language_02, data, result, "02")
+    columns_cognitive_05 = ["c11_5", "c12_5", "c13_5", "c14_5", "c15_5", "c16_5"]
+    columns_reaction_05 = ["c21a_5", "c21b_5", "c21c_5"]
+    columns_attention_05 = ["c31a_5", "c31b_5", "c31c_5", "c31d_5", "c31e_5", "c32_5"]
+    columns_memory_05 = ["c41a_5",  "c41b_5", "c41c_5"]
+    columns_language_05 = ["c51a_5", "c51b_5", "c52_5", "c53a_5", "c53b_5", "c53c_5"]
+    get_mmse(columns_cognitive_05,columns_reaction_05,columns_attention_05, columns_memory_05 ,columns_language_05, data, result, "05")
+    columns_cognitive_08 = ["c11_8", "c12_8", "c13_8", "c14_8", "c15_8", "c16_8"]
+    columns_reaction_08 = ["c21a_8", "c21b_8", "c21c_8"]
+    columns_attention_08 = ["c31a_8", "c31b_8", "c31c_8", "c31d_8", "c31e_8", "c32_8"]
+    columns_memory_08 = ["c41a_8",  "c41b_8", "c41c_8"]
+    columns_language_08 = ["c51a_8", "c51b_8", "c52_8", "c53a_8", "c53b_8", "c53c_8"]
+    get_mmse(columns_cognitive_08,columns_reaction_08,columns_attention_08, columns_memory_08 ,columns_language_08, data, result, "08")
+    columns_cognitive_11 = ["c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c16_11"]
+    columns_reaction_11 = ["c21a_11", "c21b_11", "c21c_11"]
+    columns_attention_11 = ["c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11"]
+    columns_memory_11 = ["c41a_11",  "c41b_11", "c41c_11"]
+    columns_language_11 = ["c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11"]
+    get_mmse(columns_cognitive_11,columns_reaction_11,columns_attention_11, columns_memory_11 ,columns_language_11, data, result, "11")
+    columns_cognitive_14 = ["c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c16_14"]
+    columns_reaction_14 = ["c21a_14", "c21b_14", "c21c_14"]
+    columns_attention_14 = ["c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14"]
+    columns_memory_14 = ["c41a_14",  "c41b_14", "c41c_14"]
+    columns_language_14 = ["c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14"]
+    get_mmse(columns_cognitive_14,columns_reaction_14,columns_attention_14, columns_memory_14 ,columns_language_14, data, result, "14")
+    columns_cognitive_18 = ["c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c16_18"]
+    columns_reaction_18 = ["c21a_18", "c21b_18", "c21c_18"]
+    columns_attention_18 = ["c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18"]
+    columns_memory_18 = ["c41a_18",  "c41b_18", "c41c_18"]
+    columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
+    get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
+    print(result.head())
+    result.to_csv("CLHLS/clhls_1998_2018_result.csv", index=False)
+    print(123)
+
+
+
+

+ 3 - 0
HRS_P/HRS_preprocess.py

@@ -91,6 +91,9 @@ if __name__ == "__main__":
         data = {
             "HHID":HHID_list,
             "PN":PN_list,
+            "BORN_YEAR":BORN_YEAR_list,
+            "SEX":SEX_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
             "EDUCATION":EDUCATION_list
         }
         data["WAVE"] = 2006

+ 8 - 7
MR相关工作.md

@@ -190,15 +190,16 @@ HyenaDNA: Long-Range Genomic Sequence Modeling at Single Nucleotide Resolution
 
 # 本周工作 8.26
 1. 蛋白与冠心病相关性的分析思路:
-   - 去除弱工具变量,F>=10
    - 关联性分析:(p<5*e-8)
    - 边锁不平衡:(设置clump_kb=10000, clump_r2 =0.001)
    - 孟德尔随机化分析,筛选疾病相关的暴露
-   - 结果可视化:散点图、森林图、留一法敏感性分析、漏斗图
-   - 异质性检测
-   - 敏感性分析
-   - 多效性分析
+   - 结果可视化:散点图(看斜率,斜率越大结果最好)、森林图(意义不大)、留一法敏感性分析、漏斗图(对称)
+   - 异质性检测(不存在 p>0.05)有异质性不可怕,只是可信度相对低点
+   - 多效性分析(是否违背三条假设,不违背p>0.05)
+   - ivw p<0.05  MR egger
 2. 暴露基因要与工具变量相关,结局基因要与工具变量不相关
 3. 数据选择:同地区的不同样本集的数据(利用遗传变异与暴露之间的联系来推断遗传变异与结果之间的因果关系,因此暴露数据和结果数据必须来自不同的样本,以避免遗传相关性干扰因果推断。如果使用相同的样本用于暴露和结果数据,那么暴露和结果之间的相关性可能是由遗传相关性导致的,而不是真正的因果关系。)
-4. 心血管疾病的相关数据来自PhenoScanner
-5. 蛋白相关数据来自www.decode.com/summarydata/
+4. 心血管疾病的相关数据来自芬兰
+   - I9_CHD	Major coronary heart disease event	IX Diseases of the circulatory system (I9_)	51098 cases	402635 controls
+   - I9_CORATHER	Coronary atherosclerosis	IX Diseases of the circulatory system (I9_)	56685 cases 378019 controls
+5. 蛋白相关数据来自冰岛的35,559 Icelanders,地址:www.decode.com/summarydata/,相关论文"Large-scale integration of the plasma proteome with genetics and disease"

+ 2 - 0
README.md

@@ -2,3 +2,5 @@
 CHARLS_P中是对CHARLS数据进行处理的程序
 NHANES_P中是对NHANES数据进行处理的程序
 AreaCity-Query-Geometry中是对坐标数据进行处理的程序
+
+夜光暴露与空气污染对慢性非传染性疾病(高血压、心脏病、中风、糖尿病、关节炎、癌症和记忆相关疾病)的交互影响(赵竞腾)