JazzZhao 4 months ago
parent
commit
6c74a7d365

+ 21 - 0
CHARLS_NL.py

@@ -0,0 +1,21 @@
+import pandas as pd
+
+
+# Read the CHARLS data
+CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
+# Read the night-light data
+pollutants_data = pd.read_csv("night_light_result.csv", encoding="utf-8")
+# Survey wave (year) to process
+year = 2020
+# Add two columns holding the environmental values for the year before and two years before `year`
+# CHARLS_data[['last_year_pm2.5', "before_last_pm2.5"]]=''
+# Select the rows belonging to this wave
+CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
+# Merge the two tables on city name
+table_merge = pd.merge(CHARLS_data_year, pollutants_data, left_on="city", right_on="ext_name", how='left')
+# table_merge_last.to_csv("123.csv",index=False)
+# Write the merged night-light values back into the CHARLS table
+CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_nl'] = table_merge[str(year-1)].values
+CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_nl'] = table_merge[str(year-2)].values
+CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
+print(year)

+ 59 - 0
CHARLS_PM.py

@@ -0,0 +1,59 @@
+import pandas as pd
+from glob import glob
+import os
+
+def pollutant_handle(CHARLS_data):
+    # Read the pollutant data
+    pollutants_data = pd.read_csv("result_O3_p.csv")
+    # Survey wave (year) to process
+    year = 2020
+    # Select the rows belonging to this wave
+    CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
+    # Merge the two tables
+    table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
+    # Write the merged values back into the CHARLS table
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_O3'] = table_merge[str(year-1)].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_O3'] = table_merge[str(year-2)].values
+    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
+    print(year)
+
+def aba_handle(CHARLS_data):
+    # Year of CHARLS data to process
+    year = 2020
+    path = "aba627/result/"
+    # Read the PM2.5 species files
+    last_year_file_name = path+str(year-1)+"_PM25_and_species_p.csv"
+    before_last_file_name = path+str(year-2)+"_PM25_and_species_p.csv"
+    last_year_pollutants_data = pd.read_csv(last_year_file_name)
+    before_last_pollutants_data = pd.read_csv(before_last_file_name)
+    # Select the rows belonging to this wave
+    CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
+    # Merge with last year's species file
+    last_table_merge = pd.merge(CHARLS_data_year, last_year_pollutants_data, on=['province', 'city'], how='left')
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_SO4'] = last_table_merge["SO4"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_NO3'] = last_table_merge["NO3"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_NH4'] = last_table_merge["NH4"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_OM'] = last_table_merge["OM"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_BC'] = last_table_merge["BC"].values
+    # Merge with the species file from two years earlier
+    before_last_table_merge = pd.merge(CHARLS_data_year, before_last_pollutants_data, on=['province', 'city'], how='left')
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_SO4'] = before_last_table_merge["SO4"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_NO3'] = before_last_table_merge["NO3"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_NH4'] = before_last_table_merge["NH4"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_OM'] = before_last_table_merge["OM"].values
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_BC'] = before_last_table_merge["BC"].values
+    # Write the updated CHARLS table
+    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
+    print(year)
+
+if __name__ == "__main__":
+    # Read the CHARLS data
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
+    CHARLS_data.info()
+    # CHARLS_data1 = pd.read_csv("NHANES/result_all.csv")
+    # print(CHARLS_data1.info())
+    
+    # Process pollutants
+    # pollutant_handle(CHARLS_data)
+    # Process PM2.5 species
+    # aba_handle(CHARLS_data)

+ 11 - 0
CHARLS_preprocess.py

@@ -0,0 +1,11 @@
+import pandas as pd
+
+
+
+if __name__ == "__main__":
+    path = "CHARLS_data_pollutants.csv"
+    data = pd.read_csv(path, encoding="utf-8")
+    data.info()
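+    # Fill each respondent's missing birth year with the mean of their non-missing reports, then derive age at each wave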
+    data["born_year"] = data.groupby("ID")["born_year"].transform(lambda x : x.fillna(x.mean()))
+    data["age"] = data["wave"] - data["born_year"]
+    data.to_csv("CHARLS_data_pollutants_born.csv", encoding="utf-8", index=False)

+ 417 - 0
CHARLS_preprocess.r

@@ -0,0 +1,417 @@
+# biomarkers: physical examination data
+# community: community data (2011 only)
+# demographic_background: individual demographics
+# family_information: family member data
+# family_transfer: intra-family economic transfers
+# individual_income: individual income and assets
+# household_income: household income, expenditure and assets (2011 and 2013 need computing; converted to exp_income_wealth)
+# household_roster: household member data (2011), later split into three parts: parent, Child, Other_HHmember
+# housing_characteristics: housing data
+# Exit_Interview: exit data (2013)
+# Verbal_Autopsy: cause-of-death data (2013)
+# Exit_Module: exit questionnaire (2020)
+# exp_income_wealth: household income, pre-aggregated
+# interviewer_observation: interviewer observations
+# psu: mapping from community codes to cities
+# Special modules:
+# health_care_and_insurance: health care and insurance
+# health_status_and_functioning: health status and functioning
+# work_retirement_and_pension: work, retirement and pensions
+# Note: some questionnaire skip patterns must be handled explicitly, mainly to distinguish them from genuinely missing values
+# install.packages("haven")
+# install.packages("readstata13",repos = "https://mirrors.sjtug.sjtu.edu.cn/cran/")
+library(haven)
+library(readstata13)
+library(dplyr)
+year = "2020"
+path = "/root/r_base/CHARLS/"
+# Read the wave-specific data files
+if(year == "2011"){
+    demo <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/demographic_background.dta"))
+    psu <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/psu.dta"), encoding = "GBK")
+    biomarkers <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/biomarkers.dta"))
+    blood <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Blood_20140429.dta"))
+    health_status <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/health_status_and_functioning.dta"))
+    health_care <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/health_care_and_insurance.dta"))
+    exp_income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/exp_income_wealth.dta"))
+}else if(year == "2013"){
+    demo <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Demographic_Background.dta"))
+    psu <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/PSU.dta"), encoding = "GBK")
+    biomarkers <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Biomarker.dta"))
+    health_status <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Status_and_Functioning.dta"))
+    health_care <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Care_and_Insurance.dta"))
+    exp_income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/exp_income_wealth.dta"))
+}else if (year == "2015"){
+    demo <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Demographic_Background.dta"))
+    psu <- read_dta(paste0("/root/r_base/CHARLS/CHARLS","2013","/PSU.dta"), encoding = "GBK")
+    biomarkers <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Biomarker.dta"))
+    blood <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Blood.dta"))
+    health_status <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Status_and_Functioning.dta"))
+    health_care <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Care_and_Insurance.dta"))
+    Household_Income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Household_Income.dta"))
+    Individual_Income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Individual_Income.dta"))
+}else if(year == '2018'){
+    demo <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Demographic_Background.dta"))
+    psu <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",'2013',"/PSU.dta"), encoding = "GBK")
+    health_status <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Status_and_Functioning.dta"))
+    health_care <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Care_and_Insurance.dta"))
+    Household_Income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Household_Income.dta"))
+    Individual_Income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Individual_Income.dta"))
+    Cognition <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Cognition.dta"))
+}else{
+    demo <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Demographic_Background.dta"))
+    psu <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",'2013',"/PSU.dta"), encoding = "GBK")
+    health_status <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Health_Status_and_Functioning.dta"))
+    Household_Income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Household_Income.dta"))
+    Individual_Income <- read_dta(paste0("/root/r_base/CHARLS/CHARLS",year,"/Individual_Income.dta"))
+}
+# Gender / age / residence / marital status
+if(year == '2011'){
+    data <- demo[, c('ID','householdID', 'communityID','rgender','ba002_1','be001')]
+}else if(year == "2013"){
+    data <- demo[, c('ID','householdID', 'communityID','ba000_w2_3','ba002_1','be001')]
+}else if(year == "2015"){
+    data <- demo[, c('ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001')]
+}else if(year == "2018"){
+    data <- demo[, c('ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001')]
+}else if(year == "2020"){
+    data <- demo[, c('ID','householdID', 'communityID','ba001', 'ba003_1','ba011')]
+}
+# gender
+colnames(data)[4] <- "gender"
+# birth year (age is derived below)
+colnames(data)[5] <- "born_year"
+# marital status
+colnames(data)[6] <- "be001"
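+# Age at interview = wave year - birth year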
+data$age <- ifelse(is.na(data$born_year), NA, as.numeric(year)-data$born_year)
+data$wave <- year
+# Residence
+data <- merge(data, psu[,c('communityID', 'province', 'city')], by = "communityID", all.x = TRUE)
+
+# Align province and city names with the format used in the pollutant data
+#海东地区->海东市
+data$city[data$city == "海东地区"] <- "海东市"
+#北京 -> 北京市
+data$city[data$city == "北京"] <- "北京市"
+data$province[data$province == "北京"] <- "北京市"
+#哈尔滨 -> 哈尔滨市
+data$city[data$city == "哈尔滨"] <- "哈尔滨市"
+#天津 -> 天津市
+data$city[data$city == "天津"] <- "天津市"
+data$province[data$province == "天津"] <- "天津市"
+#广西省 -> 广西壮族自治区
+data$province[data$province == "广西省"] <- "广西壮族自治区"
+#巢湖市 -> 合肥市
+data$city[data$city == "巢湖市"] <- "合肥市"
+#襄樊市->襄阳市
+data$city[data$city == "襄樊市"] <- "襄阳市"
+
+# Height / weight / systolic BP / diastolic BP / pulse
+if(year == '2011'){
+    biomarkers_select <- biomarkers[, c('ID','householdID', 'communityID','qi002','ql002','qa011','qa012', 'qa013')]
+}else if(year == "2013"){
+    biomarkers_select <- biomarkers[, c('ID','householdID', 'communityID','qi002','ql002','qa011','qa012', 'qa013')]
+}else if(year == "2015"){
+    biomarkers_select <- biomarkers[, c('ID','householdID', 'communityID','qi002', 'ql002', 'qa011','qa012', 'qa013')]
+}
+if (year == '2011' | year == '2013' | year == '2015'){
+    data <- merge(data, biomarkers_select, by = c('ID','householdID', 'communityID'), all.x = TRUE)   
+}else{
+    # Column names to create
+    new_columns <- c('qi002', 'ql002', 'qa011','qa012', 'qa013')
+    # Create the new columns and fill them with NA
+    for (col_name in new_columns) {
+      data[[col_name]] <- NA
+    }
+}
+# Blood assays: white blood cells (WBC), mean corpuscular volume (MCV), platelets, blood urea nitrogen (bun), glucose (glu), creatinine (crea), total cholesterol (cho), triglycerides (tg), HDL, LDL, C-reactive protein (CRP),
+# glycated hemoglobin (hba1c), uric acid (ua), hematocrit, hemoglobin (hgb), cystatin C
+if(year == '2011'){
+    blood <- subset(blood, select = -c(bloodweight, qc1_va003))
+}else if(year == '2015'){
+    blood <- blood[, c('ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp'
+                       , 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc')]
+}
+if(year == '2011' | year == '2015'){
+    colnames(blood) <- c('ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp'
+                       , 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc')
+    data <- merge(data, blood, by = c('ID'), all.x = TRUE)    
+}else{
+    # Column names to create
+    new_columns <- c('bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp'
+                       , 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc')
+
+    # Create the new columns and fill them with NA
+    for (col_name in new_columns) {
+      data[[col_name]] <- NA
+    }
+}
+
+# Health status and functioning:
+if(year == '2018'){
+    health_status$general_helth_status <-  health_status$da002
+}else if(year =="2020"){
+    health_status$general_helth_status <-  health_status$da001
+}else{
+    health_status$general_helth_status <- ifelse(is.na(health_status$da001), health_status$da002, health_status$da001)
+}
+
+# Disease history, physical activity, smoking and drinking
+if(year == '2013'){
+    names(health_status)[names(health_status) %in% c("dc006_1_s1", "dc006_1_s2", 'dc006_1_s3', 'dc006_1_s4', 'dc006_1_s5','dc006_1_s6', 
+                                                     'dc006_1_s7', 'dc006_1_s8','dc006_1_s9', 'dc006_1_s10','dc006_1_s11')]<- c("dc006s1", "dc006s2", 'dc006s3', 
+                                                                         'dc006s4', 'dc006s5','dc006s6', 'dc006s7','dc006s8', 'dc006s9', 'dc006s10', 'dc006s11')
+}else if(year == "2018"){
+    names(Cognition)[names(Cognition) %in% c("dc001_w4", "dc006_w4", 
+                                                     'dc003_w4', 'dc005_w4', 'dc002_w4')]<- c("dc001s1", "dc001s2",'dc001s3','dc002','dc003') 
+    names(Cognition)[names(Cognition) %in% c("dc014_w4_1_1", "dc014_w4_2_1", 
+                                                     'dc014_w4_3_1', 'dc014_w4_4_1', 'dc014_w4_5_1')]<- c("dc019", "dc020", 
+                                                                                              'dc021', 'dc022', 'dc023')
+    names(Cognition)[names(Cognition) %in% c("dc028_w4_s1", "dc028_w4_s2", 'dc028_w4_s3', 'dc028_w4_s4', 'dc028_w4_s5','dc028_w4_s6', 
+                                                     'dc028_w4_s7', 'dc028_w4_s8', 
+                                                     'dc028_w4_s9', 'dc028_w4_s10', 
+                                                     'dc028_w4_s11')]<- c("dc006s1", "dc006s2", 'dc006s3', 
+                                                                         'dc006s4', 'dc006s5','dc006s6', 'dc006s7', 
+                                                                         'dc006s8', 'dc006s9', 'dc006s10', 'dc006s11')
+    names(Cognition)[names(Cognition) %in% c("dc047_w4_s1", "dc047_w4_s2", 'dc047_w4_s3', 'dc047_w4_s4', 'dc047_w4_s5','dc047_w4_s6', 
+                                                 'dc047_w4_s7', 'dc047_w4_s8', 
+                                                 'dc047_w4_s9', 'dc047_w4_s10', 
+                                                 'dc047_w4_s11', 'dc024_w4')]<- c("dc027s1", "dc027s2", 'dc027s3', 
+                                                                     'dc027s4', 'dc027s5','dc027s6', 'dc027s7', 
+                                                                     'dc027s8', 'dc027s9', 'dc027s10', 'dc027s11','dc025')
+}else if (year == "2020"){
+    # Word recall, first trial
+    names(health_status)[names(health_status) %in% c("dc012_s1", "dc012_s2", 'dc012_s3', 'dc012_s4', 'dc012_s5','dc012_s6','dc012_s7', 'dc012_s8','dc012_s9', 'dc012_s10', 
+                                                     'dc012_s11')]<- c("dc006s1", "dc006s2", 'dc006s3','dc006s4', 'dc006s5','dc006s6', 'dc006s7','dc006s8', 'dc006s9', 'dc006s10', 'dc006s11')
+    # Word recall, second trial
+    names(health_status)[names(health_status) %in% c("dc028_s1", "dc028_s2", 'dc028_s3', 'dc028_s4', 'dc028_s5','dc028_s6','dc028_s7', 'dc028_s8','dc028_s9', 'dc028_s10', 
+                                                     'dc028_s11')]<- c("dc027s1", "dc027s2", 'dc027s3','dc027s4', 'dc027s5','dc027s6', 'dc027s7', 
+                                                     'dc027s8', 'dc027s9', 'dc027s10', 'dc027s11')
+}
+
+# Activities of daily living (ADL): six items (toileting, eating, dressing, continence, getting in/out of bed, bathing); needing help with any item counts as ADL disability, i.e. a score > 0 means disabled
+if (year == "2020"){
+    health_status$db010_score <- ifelse(health_status$db001 > 2, 1, 0)
+    health_status$db011_score <- ifelse(health_status$db003 > 2, 1, 0)
+    health_status$db012_score <- ifelse(health_status$db005 > 2, 1, 0)                                         
+    health_status$db013_score <- ifelse(health_status$db007 > 2, 1, 0)                                         
+    health_status$db014_score <- ifelse(health_status$db009 > 2, 1, 0)                                         
+    health_status$db015_score <- ifelse(health_status$db011 > 2, 1, 0)    
+}else{
+    health_status$db010_score <- ifelse(health_status$db010 > 2, 1, 0)
+    health_status$db011_score <- ifelse(health_status$db011 > 2, 1, 0)
+    health_status$db012_score <- ifelse(health_status$db012 > 2, 1, 0)                                         
+    health_status$db013_score <- ifelse(health_status$db013 > 2, 1, 0)                                         
+    health_status$db014_score <- ifelse(health_status$db014 > 2, 1, 0)                                         
+    health_status$db015_score <- ifelse(health_status$db015 > 2, 1, 0)
+}
+health_status$ADL_score <- apply(health_status[,c('db010_score','db011_score','db012_score', 'db013_score', 'db014_score'
+                                      ,'db015_score')], 1, function(x) sum(x))                                         
+                                         
+# IADL: five items (housework, cooking, shopping, taking medication, managing money); needing help with any item counts as IADL disability
+if (year =='2020'){
+    health_status$db016_score <- ifelse(health_status$db012 > 2, 1, 0)
+    health_status$db017_score <- ifelse(health_status$db014 > 2, 1, 0)
+    health_status$db018_score <- ifelse(health_status$db016 > 2, 1, 0)                                         
+    health_status$db019_score <- ifelse(health_status$db020 > 2, 1, 0)                                         
+    health_status$db020_score <- ifelse(health_status$db022 > 2, 1, 0) 
+}else{
+    health_status$db016_score <- ifelse(health_status$db016 > 2, 1, 0)
+    health_status$db017_score <- ifelse(health_status$db017 > 2, 1, 0)
+    health_status$db018_score <- ifelse(health_status$db018 > 2, 1, 0)                                         
+    health_status$db019_score <- ifelse(health_status$db019 > 2, 1, 0)                                         
+    health_status$db020_score <- ifelse(health_status$db020 > 2, 1, 0) 
+}
+health_status$IADL_score <- apply(health_status[,c('db016_score','db017_score','db018_score', 'db019_score', 'db020_score')], 1, function(x) sum(x))   
+
+if(year == "2020"){
+    # The 2020 wave labels diseases differently from other waves and needs recoding
+    # Columns to process
+    columns_to_process <- c('da002_1_', 'da002_2_','da002_3_'
+                            ,'da002_4_','da002_5_','da002_6_','da002_7_','da002_8_','da002_9_','da002_10_','da002_11_'
+                            ,'da002_12_','da002_13_','da002_14_','da002_15_')
+    # Recode the listed columns with mutate_at()
+    health_status <- health_status %>%
+        mutate_at(vars(columns_to_process), ~ case_when(
+            . == 99 ~ 2,
+            . %in% 1:3 ~ 1,
+            TRUE ~ NA_real_
+        ))
+    # 2020 splits Parkinson's disease and memory-related disease into separate items; merge them to match earlier waves
+    # Conditional recode via mutate() and case_when()
+    health_status <- health_status %>%
+    mutate(
+        da002_12_ = case_when(
+        da002_12_ == 1 | da002_13_ == 1 ~ 1,
+        da002_12_ == 2 & da002_13_ == 2 ~ 2,
+        da002_12_ == 2 & is.na(da002_13_) | is.na(da002_12_) & da002_13_ == 2 ~ 2,
+        is.na(da002_12_) & is.na(da002_13_) ~ NA_real_,
+        TRUE ~ NA_real_  # catch-all: any remaining case becomes NA
+        )
+    )
+    health_status_select <- health_status[, c('ID','householdID', 'communityID', 'general_helth_status'
+                                   ,'ADL_score', 'IADL_score', 'da002_1_', 'da002_2_','da002_3_'
+                                   ,'da002_4_','da002_5_','da002_6_','da002_7_','da002_8_','da002_9_','da002_10_','da002_11_'
+                                   ,'da002_12_','da002_14_','da002_15_','da032_1_','da032_2_', 'da032_3_'
+                                   ,'da033_1_','da033_2_','da033_3_','da034_1_','da034_2_','da034_3_','da035_1_','da035_2_','da035_3_'
+                                    ,'da036_1_','da036_2_','da036_3_', 'da046','da047','da050_1'
+                                   ,'da051')]
+    health_status_select$da051 <- ifelse(health_status_select$da051==1, 3, ifelse(health_status_select$da051==3, 1, health_status_select$da051))
+}else{
+    health_status_select <- health_status[, c('ID','householdID', 'communityID', 'general_helth_status'
+                                   ,'ADL_score', 'IADL_score', 'da007_1_', 'da007_2_','da007_3_'
+                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
+                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
+                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
+                                    ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
+                                   ,'da069')]
+}
+colnames(health_status_select) <- c('ID', 'householdID', 'communityID', 'general_helth_status'
+                            ,'ADL_score', 'IADL_score', 'Hypertension','Dyslipidemia','Diabetes_or_High_Blood_Sugar'
+                             ,'Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Disease'
+                             ,'Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism'
+                             ,'Asthma', 'Vigorous_Activities', 'Moderate_Physical_Effort','Walking'
+                             ,'Vigorous_Activities_day', 'Moderate_Physical_Effort_day','Walking_day','Vigorous_Activities_2h', 'Moderate_Physical_Effort_2h','Walking_2h'
+                             ,'Vigorous_Activities_30m', 'Moderate_Physical_Effort_30m','Walking_30m','Vigorous_Activities_4h', 'Moderate_Physical_Effort_4h','Walking_4h'
+                             ,'Smoke', 'Smoke_still','Number_Cigarettes','Drink')
+data <- merge(data, health_status_select, by = c('ID', 'householdID', 'communityID'), all.x = TRUE)  
+
+if(year =="2018"){
+    health_status = Cognition
+}
+# Cognitive-function score, three parts: telephone-interview items (10 points), word recall (10 points), figure drawing (1 point)
+if(year == "2020"){
+    health_status$dc001s1_score <- ifelse(is.na(health_status$dc001), 0, ifelse(health_status$dc001 == 1, 1, 0))
+    health_status$dc001s2_score <- ifelse(is.na(health_status$dc005), 0, ifelse(health_status$dc005 == 2, 1, 0))
+    health_status$dc001s3_score <- ifelse(is.na(health_status$dc003), 0, ifelse(health_status$dc003 == 3, 1, 0))
+    health_status$dc002_score <- ifelse(is.na(health_status$dc004), 0, ifelse(health_status$dc004 == 1, 1, 0))
+    health_status$dc003_score <- ifelse(is.na(health_status$dc002), 0, ifelse(health_status$dc002 == 1, 1, 0))
+    health_status$dc019_score <- ifelse(is.na(health_status$dc007_1), 0, ifelse(health_status$dc007_1 == 93, 1, 0))
+    health_status$dc020_score <- ifelse(is.na(health_status$dc007_2), 0, ifelse(health_status$dc007_2 == 86, 1, 0))
+    health_status$dc021_score <- ifelse(is.na(health_status$dc007_3), 0, ifelse(health_status$dc007_3 == 79, 1, 0))
+    health_status$dc022_score <- ifelse(is.na(health_status$dc007_4), 0, ifelse(health_status$dc007_4 == 72, 1, 0))
+    health_status$dc023_score <- ifelse(is.na(health_status$dc007_5), 0, ifelse(health_status$dc007_5 == 65, 1, 0))
+}else{
+    health_status$dc001s1_score <- ifelse(is.na(health_status$dc001s1), 0, ifelse(health_status$dc001s1 == 1, 1, 0))
+    health_status$dc001s2_score <- ifelse(is.na(health_status$dc001s2), 0, ifelse(health_status$dc001s2 == 2, 1, 0))
+    health_status$dc001s3_score <- ifelse(is.na(health_status$dc001s3), 0, ifelse(health_status$dc001s3 == 3, 1, 0))
+    health_status$dc002_score <- ifelse(is.na(health_status$dc002), 0, ifelse(health_status$dc002 == 1, 1, 0))
+    health_status$dc003_score <- ifelse(is.na(health_status$dc003), 0, ifelse(health_status$dc003 == 1, 1, 0))
+    health_status$dc019_score <- ifelse(is.na(health_status$dc019), 0, ifelse(health_status$dc019 == 93, 1, 0))
+    health_status$dc020_score <- ifelse(is.na(health_status$dc020), 0, ifelse(health_status$dc020 == 86, 1, 0))
+    health_status$dc021_score <- ifelse(is.na(health_status$dc021), 0, ifelse(health_status$dc021 == 79, 1, 0))
+    health_status$dc022_score <- ifelse(is.na(health_status$dc022), 0, ifelse(health_status$dc022 == 72, 1, 0))
+    health_status$dc023_score <- ifelse(is.na(health_status$dc023), 0, ifelse(health_status$dc023 == 65, 1, 0))
+}
+health_status$Cognitive_functioning <- apply(health_status[,c('dc001s1_score','dc001s2_score','dc001s3_score', 'dc002_score', 'dc003_score'
+                                      ,'dc019_score','dc020_score','dc021_score','dc022_score','dc023_score')], 1, function(x) sum(x))
+
+# Word recall
+health_status$dc006s1_score <- ifelse(is.na(health_status$dc006s1), 0, ifelse(health_status$dc006s1 == 1, 1, 0))
+health_status$dc006s2_score <- ifelse(is.na(health_status$dc006s2), 0, ifelse(health_status$dc006s2 == 2, 1, 0))
+health_status$dc006s3_score <- ifelse(is.na(health_status$dc006s3), 0, ifelse(health_status$dc006s3 == 3, 1, 0))
+health_status$dc006s4_score <- ifelse(is.na(health_status$dc006s4), 0, ifelse(health_status$dc006s4 == 4, 1, 0))
+health_status$dc006s5_score <- ifelse(is.na(health_status$dc006s5), 0, ifelse(health_status$dc006s5 == 5, 1, 0))
+health_status$dc006s6_score <- ifelse(is.na(health_status$dc006s6), 0, ifelse(health_status$dc006s6 == 6, 1, 0))                                             
+health_status$dc006s7_score <- ifelse(is.na(health_status$dc006s7), 0, ifelse(health_status$dc006s7 == 7, 1, 0))
+health_status$dc006s8_score <- ifelse(is.na(health_status$dc006s8), 0, ifelse(health_status$dc006s8 == 8, 1, 0))
+health_status$dc006s9_score <- ifelse(is.na(health_status$dc006s9), 0, ifelse(health_status$dc006s9 == 9, 1, 0))                                             
+health_status$dc006s10_score <- ifelse(is.na(health_status$dc006s10), 0, ifelse(health_status$dc006s10 == 10, 1, 0))                                             
+health_status$dc006s11_score <- ifelse(is.na(health_status$dc006s11), 0, ifelse(health_status$dc006s11 == 11, 1, 0))
+health_status$dc027s1_score <- ifelse(is.na(health_status$dc027s1), 0, ifelse(health_status$dc027s1 == 1, 1, 0))
+health_status$dc027s2_score <- ifelse(is.na(health_status$dc027s2), 0, ifelse(health_status$dc027s2 == 2, 1, 0))
+health_status$dc027s3_score <- ifelse(is.na(health_status$dc027s3), 0, ifelse(health_status$dc027s3 == 3, 1, 0))
+health_status$dc027s4_score <- ifelse(is.na(health_status$dc027s4), 0, ifelse(health_status$dc027s4 == 4, 1, 0))
+health_status$dc027s5_score <- ifelse(is.na(health_status$dc027s5), 0, ifelse(health_status$dc027s5 == 5, 1, 0))
+health_status$dc027s6_score <- ifelse(is.na(health_status$dc027s6), 0, ifelse(health_status$dc027s6 == 6, 1, 0))                                             
+health_status$dc027s7_score <- ifelse(is.na(health_status$dc027s7), 0, ifelse(health_status$dc027s7 == 7, 1, 0))
+health_status$dc027s8_score <- ifelse(is.na(health_status$dc027s8), 0, ifelse(health_status$dc027s8 == 8, 1, 0))
+health_status$dc027s9_score <- ifelse(is.na(health_status$dc027s9), 0, ifelse(health_status$dc027s9 == 9, 1, 0))                                             
+health_status$dc027s10_score <- ifelse(is.na(health_status$dc027s10), 0, ifelse(health_status$dc027s10 == 10, 1, 0))                                             
+health_status$dc027s11_score <- ifelse(is.na(health_status$dc027s11), 0, ifelse(health_status$dc027s11 == 11, 1, 0))
+health_status$remember_functioning <- apply(health_status[,c('dc006s1_score','dc006s2_score','dc006s3_score', 'dc006s4_score', 'dc006s5_score'
+                                      ,'dc006s6_score','dc006s7_score','dc006s8_score','dc006s9_score','dc006s10_score'
+                                    ,'dc006s11_score','dc027s1_score','dc027s2_score','dc027s3_score','dc027s4_score','dc027s5_score'
+                                    ,'dc027s6_score','dc027s7_score','dc027s8_score','dc027s9_score','dc027s10_score','dc027s11_score')], 1, function(x) sum(x)/2)
+# Figure drawing
+if(year == "2020"){
+    health_status$draw_score <- ifelse(is.na(health_status$dc009), 0, ifelse(health_status$dc009 == 1, 1, 0))
+}else{
+    health_status$draw_score <- ifelse(is.na(health_status$dc025), 0, ifelse(health_status$dc025 == 1, 1, 0))
+}
+
+# Mental-health score
+if(year == '2020'){
+    health_status$dc009_score <- health_status$dc016-1
+    health_status$dc010_score <- health_status$dc017-1
+    health_status$dc011_score <- health_status$dc018-1
+    health_status$dc012_score <- health_status$dc019-1                                            
+    health_status$dc013_score <- 4 - health_status$dc020                                        
+    health_status$dc014_score <- health_status$dc021-1                                            
+    health_status$dc015_score <- health_status$dc022-1                                            
+    health_status$dc016_score <- 4 - health_status$dc023
+    health_status$dc017_score <- health_status$dc024-1                                            
+    health_status$dc018_score <- health_status$dc025-1 
+}else{
+    health_status$dc009_score <- health_status$dc009-1
+    health_status$dc010_score <- health_status$dc010-1
+    health_status$dc011_score <- health_status$dc011-1
+    health_status$dc012_score <- health_status$dc012-1                                            
+    health_status$dc013_score <- 4 - health_status$dc013                                        
+    health_status$dc014_score <- health_status$dc014-1                                            
+    health_status$dc015_score <- health_status$dc015-1                                            
+    health_status$dc016_score <- 4 - health_status$dc016
+    health_status$dc017_score <- health_status$dc017-1                                            
+    health_status$dc018_score <- health_status$dc018-1 
+}
+                                           
+health_status$psychiatric_score <- apply(health_status[,c('dc009_score','dc010_score','dc011_score', 'dc012_score', 'dc013_score'
+                                      ,'dc014_score','dc015_score','dc016_score','dc017_score','dc018_score')], 1, function(x) sum(x))
+
+health_status <- health_status[, c('ID','householdID', 'communityID','Cognitive_functioning','remember_functioning'
+                                   ,'draw_score','psychiatric_score')] 
+colnames(health_status) <- c('ID', 'householdID', 'communityID','Cognitive_functioning','remember_functioning'
+                            ,'draw_score','psychiatric_score')
+data <- merge(data, health_status, by = c('ID', 'householdID', 'communityID'), all.x = TRUE)  
+
+# Hospitalization
+if (year != '2020'){
+    health_care =  health_care[, c('ID','householdID', 'communityID', 'ee003', 'ee004')]
+    colnames(health_care) <- c('ID','householdID', 'communityID', 'received_inpatient_care',"Frequency_one_year")
+    data <- merge(data, health_care, by = c('ID', 'householdID', 'communityID'), all.x = TRUE)  
+}else{
+    data['received_inpatient_care'] <- NA
+    data['Frequency_one_year'] <- NA
+}
+
+# Individual income
+if (year == '2011' | year == '2013'){
+    exp_income =  exp_income[, c('ID','householdID', 'communityID','INDV_INCOME')]
+    data <- merge(data, exp_income, by = c('ID', 'householdID', 'communityID'), all.x = TRUE)  
+}else {
+    Individual_Income$INDV_INCOME <- ifelse(Individual_Income$ga001==2, 0, Individual_Income$ga002)
+    data <- merge(data, Individual_Income[,c('ID','householdID', 'communityID','INDV_INCOME')], by = c('ID', 'householdID', 'communityID'), all.x = TRUE)
+    # Household_Income$INCOME_TOTAL <- apply(Household_Income[,c('ga006_1_1_','ga006_1_2_','ga006_1_3_'
+    #                                         ,'ga006_1_4_','ga006_1_5_','ga006_1_6_','ga006_1_7_','ga006_1_8_','ga006_1_9_','ga006_1_10_')], 1, function(x) sum(x, na.rm = TRUE))
+}
+write.csv(data, file = paste0(path, "result", year, ".csv"), row.names = FALSE)
+
+
+# Combine the per-wave result files
+csv_files <- list.files(path = path, pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
+df_combined <- NULL
+# Make sure the file paths are complete
+if (length(csv_files) > 0) {
+  for (file in csv_files) {
+    # Read each .csv file
+    data <- read.csv(file, stringsAsFactors = FALSE)
+    print(ncol(data))
+    if (length(df_combined) == 0){
+      df_combined <- data
+    }else{
+      df_combined <- rbind(data, df_combined)
+    }
+    print(paste("Read file:", file))
+  }
+}
+write.csv(df_combined, file = paste0("/root/r_base/CHARLS/", "result_all", ".csv"), row.names = FALSE)

+ 28 - 0
Readme_NL.md

@@ -0,0 +1,28 @@
+## CHARLS (China Health and Retirement Longitudinal Study)
+96,617 records, 42,456 respondents, 94 features
+
+## NHANES (National Health and Nutrition Examination Survey)
+101,317 respondents, 270 features
+
+
+## Pollutant data
+TAP data: PM2.5 species (SO4²⁻ sulfate, NO3⁻ nitrate, NH4⁺ ammonium, OM organic matter, BC black carbon)
+CHAP data: PM1; PM2.5; PM10; O3
+
+## Night-light data processing
+### DMSP-OLS processing (1992-2013)
+1. Use the China shapefile to run "Extract by Mask" on the night-light rasters
+2. Saturation correction (for China, Hegang in Heilongjiang serves as the invariant target region)
+    - Pick the invariant target region: simplest is the region with the lowest GDP growth rate, or the one with the lowest DN growth rate
+    - Take F162006 Hegang as the reference (X) and fit each of the other 33 periods (Y) with several functional forms (linear, power, logarithmic, exponential, quadratic)
+    - With the Project Raster tool, reproject the 34 DMSP periods to a Lambert equal-area projection (Asia_Lambert_Conformal_Conic coordinate system), cell size X = Y = 1000, completing projection and resampling
+    - Use the China shapefile to run "Extract by Mask" on the reprojected rasters
+    - Convert the 34 rasters to points (Raster to Point tool) to get each cell's DN value, export with Table to Excel, then fit each period against F162006
+    - Apply the saturation correction with the fitted parameters
+3. Inter-calibration between sensors: take the average of overlapping years
+4. Continuity correction: apply a formula ensuring each year's value is no smaller than the previous year's (see the sketch below)
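+
+A minimal Python sketch of steps 2 and 4, assuming the per-pixel DN values have already been exported to NumPy arrays (`dn_year`, `dn_ref`, `dn_prev`, `dn_curr` are illustrative names; the quadratic model is just one of the candidate fits listed above):
+
+```python
+import numpy as np
+
+def fit_saturation_model(dn_year, dn_ref):
+    """Fit DN_ref ~ a*DN^2 + b*DN + c against the F162006 Hegang reference."""
+    return np.polyfit(dn_year, dn_ref, deg=2)  # coefficients (a, b, c)
+
+def apply_saturation_correction(dn, coeffs):
+    """Correct one period's DN values with the fitted quadratic."""
+    return np.polyval(coeffs, dn)
+
+def continuity_correct(dn_prev, dn_curr):
+    """Step 4: keep each pixel's DN no smaller than the previous year's."""
+    return np.maximum(dn_prev, dn_curr)
+```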
+
+### SNPP VIIRS processing (2012-present)
+1. Projection, clipping and resampling
+2. Remove negative and abnormal values: take Beijing/Shanghai/Guangzhou/Shenzhen as the national maximum and apply a maximum-value correction; drop remaining negatives (see the sketch below)
+3. Continuity correction: take the 2016 data as the standard and correct the series against it
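+
+A corresponding sketch for the cleaning in step 2; `national_max` stands for the maximum radiance observed over the four reference cities and is an illustrative name:
+
+```python
+import numpy as np
+
+def viirs_clean(radiance, national_max):
+    """Zero out negative radiances and cap outliers at the four-city maximum."""
+    r = np.where(radiance < 0, 0.0, radiance)
+    return np.minimum(r, national_max)
+```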

+ 63 - 0
aba_data_preprocess.py

@@ -0,0 +1,63 @@
+import pandas as pd
+import requests
+from time import sleep
+import json
+import os
+from glob import glob
+
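+# Query an administrative-boundary lookup service for the point and return (province, city);
+# a depth-0 match means province and city share one name (e.g. a direct-administered municipality)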
+def get_city(lon, lat):
+    params = {
+        "lng": "{:.6f}".format(lon),
+        "lat": "{:.6f}".format(lat)
+    }
+    flag = True
+    while(flag):
+        try:
+            response_work = requests.get(url="http://103.116.120.27:9527/queryPoint",params = params)
+            flag = False
+        except Exception as e:
+            print(f"Request failed once: {e}")
+            sleep(10)
+            pass
+    res_json_work = json.loads(response_work.text)
+    # The coordinate lies inside China
+    list_local_work = res_json_work['v']['list']
+    if len(list_local_work) > 0:
+        try:
+            if len(list_local_work) == 1:
+                province_city_work = [local_work['ext_path'] for local_work in list_local_work if local_work['deep'] == '0']
+                return province_city_work[0], province_city_work[0]
+            else:
+                province_city_work = [local_work['ext_path'] for local_work in list_local_work if local_work['deep'] == '1']
+                return province_city_work[0].split(" ")[0], province_city_work[0].split(" ")[1]
+        except Exception as e:
+            print("Exception occurred: " + json.dumps(list_local_work))
+            return "", ""
+    else:
+        print(f"Empty coordinate: {lon},{lat}\n")
+        return "", ""
+
+
+if __name__ == "__main__":
+    folder_path = "aba627/"
+    result_path = "aba627/result/"
+    # Collect all csv files in the folder
+    csv_files = glob(folder_path+"*.csv")
+    for file_path in csv_files:
+        # Province and city matched for each row
+        province_list = []
+        city_list = []
+        data = pd.read_csv(file_path, encoding="utf-8")
+        # Longitude and latitude columns
+        lons = data["X_Lon"]
+        lats = data["Y_Lat"]
+        for lon, lat in zip(lons, lats):
+            province, city = get_city(lon=lon, lat=lat)
+            province_list.append(province)
+            city_list.append(city)
+        data["province"] = province_list
+        data["city"] = city_list
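+        # Drop the leading coordinate columns and average each species column by (province, city)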
+        data = data.iloc[:,4:].groupby(by=["province", "city"]).mean().reset_index()
+        data.to_csv(result_path+os.path.basename(file_path), encoding="utf-8", index=False)
+        
+
+

+ 38 - 0
chongqing_pm.py

@@ -0,0 +1,38 @@
+import pandas as pd
+from glob import glob
+import os
+
+def pollutant_chongqing_handle():
+    path = "result_O3"
+    data = pd.read_csv(path+".csv")
+    # Rows where the province column is '重庆市' (Chongqing)
+    chongqing_rows = data[data['province'] == '重庆市']
+    # Average all columns except the first two (province, city) over these rows
+    avg_values = chongqing_rows.iloc[:, 2:].mean()
+    insert = pd.DataFrame([avg_values])
+    # Fill in the province and city columns for the new city-level row
+    insert['province'] = '重庆市'
+    insert['city'] = '重庆市'
+    df = pd.concat([data,insert])
+    df.to_csv(path+"_p.csv", index=False)
+
+def aba_chongqing_handle():
+    path = "aba627/result/"
+    files = glob(path+"*.csv")
+    for file in files:
+        data = pd.read_csv(file)
+        # Rows where the province column is '重庆市' (Chongqing)
+        chongqing_rows = data[data['province'] == '重庆市']
+        # Average all columns except the first three over these rows
+        avg_values = chongqing_rows.iloc[:, 3:].mean()
+        insert = pd.DataFrame([avg_values])
+        # Fill in the province and city columns for the new city-level row
+        insert['province'] = '重庆市'
+        insert['city'] = '重庆市'
+        df = pd.concat([data,insert])
+        tmp = os.path.basename(file)
+        file_name, extension = os.path.splitext(tmp)
+        df.to_csv(path+file_name+"_p"+extension, index=False)
+
+if __name__ == "__main__":
+    aba_chongqing_handle()

+ 517 - 0
data_preprocess.r

@@ -0,0 +1,517 @@
+# NHANES
+# install.packages("caret", repos="https://mirrors.ustc.edu.cn/CRAN/")
+library(caret)
+library(nhanesA)
+library(knitr)
+library(tidyverse)
+library(plyr)
+library(dplyr)
+library(foreign)
+
+path = "NHANES/2017-2018/"
+year = "_J"
+# year_ = "2001-2002"
+
+DEMO <- read.xport(paste0(path , "DEMO",year,".XPT"))
+
+# Select the release number, interview/examination status, gender, age, race, country of birth
+# Socioeconomic status: family PIR, annual household income
+if(year =="" || year == "_B" || year == "_C" || year == "_D"){
+  DEMO <- DEMO %>% select(SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDRETH1, DMDBORN, INDFMPIR, INDFMINC)
+}else if(year =="_E" || year == "_F"){
+  DEMO <- DEMO %>% select(SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDRETH1, DMDBORN2, INDFMPIR, INDHHIN2)
+}else{
+  DEMO <- DEMO %>% select(SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDRETH1, DMDBORN4, INDFMPIR, INDHHIN2)
+}
+print(names(DEMO)[9])
+names(DEMO)[7] <- "DMDBORN"
+names(DEMO)[9] <- "INDFMINC"
+
+# # Income
+# if(year=="_E" | year=="_F" | year=="_H" | year=="_G" |year=="_I" | year=="_J"){
+#   INQ <- read.xport(paste0(path, "INQ",year,".XPT"))
+#   data <- join_all(list(data, INQ), by = "SEQN", type = "left")
+# }
+
+
+# Weight, standing height, body mass index
+BMX <- read.xport(paste0(path, "BMX",year,".XPT"))
+BMX <- BMX %>% select(SEQN, BMXWT , BMXHT , BMXBMI )
+
+data <- join_all(list(DEMO, BMX), by = "SEQN", type = "left")
+
+# Mean systolic BP, mean diastolic BP, 60-second heart rate
+BPX <- read.xport(paste0(path, "BPX",year,".XPT"))
+BPX <- BPX %>% select(SEQN, BPXSY1, BPXDI1, BPXCHR)
+data <- join_all(list(data, BPX), by = "SEQN", type = "left")
+
+# Fasting plasma glucose
+if(year ==""){
+  LAB10AM <- read.xport(paste0(path, "LAB10AM",year,".XPT"))
+  LAB10AM <- LAB10AM %>% select(SEQN, LBXGLUSI )
+}else if (year == "_B"){
+  LAB10AM <- read.xport(paste0(path, "L10AM",year,".XPT"))
+  LAB10AM <- LAB10AM %>% select(SEQN, LBXGLUSI )
+}else if (year == "_C"){
+  LAB10AM <- read.xport(paste0(path, "L10AM",year,".XPT"))
+  LAB10AM <- LAB10AM %>% select(SEQN, LBDGLUSI)
+}else{
+  LAB10AM <- read.xport(paste0(path, "GLU",year,".XPT"))
+  LAB10AM <- LAB10AM %>% select(SEQN, LBDGLUSI)
+}
+names(LAB10AM)[2] <- "LBDGLUSI"
+
+data <- join_all(list(data, LAB10AM), by = "SEQN", type = "left")
+
+# Urinary albumin
+if (year == ""){
+  LAB16 <- read.xport(paste0(path, "LAB16",year,".XPT"))
+}else if(year == "_B" | year == "_C"){
+  LAB16 <- read.xport(paste0(path, "L16",year,".XPT"))
+}else{
+  LAB16 <- read.xport(paste0(path, "ALB_CR",year,".XPT"))
+}
+LAB16 <- LAB16 %>% select(SEQN, URXUMA)
+
+data <- join_all(list(data, LAB16), by = "SEQN", type = "left")
+
+# Complete blood count: white blood cell count (SI, 1000 cells/uL), lymphocyte count, monocyte count, red blood cell count (SI, million cells/uL), hemoglobin (g/dL), platelet count (1000 cells/uL),
+# eosinophil count, basophil count, segmented neutrophil count (1000 cells/uL)
+if (year == ""){
+  LAB25 <- read.xport(paste0(path, "LAB25",year,".XPT"))
+}else if(year == "_B" | year == "_C"){
+  LAB25 <- read.xport(paste0(path, "L25",year,".XPT"))
+}else{
+  LAB25 <- read.xport(paste0(path, "CBC",year,".XPT"))
+}
+LAB25 <- LAB25 %>% select(SEQN, LBXWBCSI, LBDLYMNO, LBDMONO, LBXRBCSI, LBXHGB, LBXPLTSI, LBDEONO, LBDBANO, LBDNENO)
+# Compute inflammation indices (SIRI, SII)
+LAB25$SIRI <- LAB25$LBDNENO*LAB25$LBDMONO/LAB25$LBDLYMNO
+LAB25$SII <- LAB25$LBXPLTSI*LAB25$LBDNENO/LAB25$LBDLYMNO
+
+data <- join_all(list(data, LAB25), by = "SEQN", type = "left")
+
+# Lipids: triglycerides (mmol/L), LDL cholesterol (mmol/L), total cholesterol, HDL cholesterol (mmol/L)
+# Kidney function: blood urea nitrogen, creatinine
+# Liver function: ALT, AST, total bilirubin
+if (year == ""){
+  LAB13AM <- read.xport(paste0(path, "LAB13AM",year,".XPT"))
+}else if(year == "_B" | year == "_C"){
+  LAB13AM <- read.xport(paste0(path, "L13AM",year,".XPT"))
+}else{
+  LAB13AM <- read.xport(paste0(path, "TRIGLY",year,".XPT"))
+}
+LAB13AM <- LAB13AM %>% select(SEQN, LBDTRSI, LBDLDLSI)
+data <- join_all(list(data, LAB13AM), by = "SEQN", type = "left")
+
+if(year ==""){
+  LAB13 <- read.xport(paste0(path, "LAB13",year,".XPT"))
+}else if(year == "_B" | year == "_C"){
+  LAB13 <- read.xport(paste0(path, "L13",year,".XPT"))
+}else{
+  LAB13 <- read.xport(paste0(path, "TCHOL",year,".XPT"))
+}
+if(year=="" | year=="_B"){
+  LAB13 <- LAB13 %>% select(SEQN, LBDTCSI, LBDHDLSI)
+}else if(year == "_C"){   #Direct HDL-Cholesterol 
+  LAB13 <- LAB13 %>% select(SEQN, LBDTCSI, LBDHDDSI)
+}else{
+  LAB13 <- LAB13 %>% select(SEQN, LBDTCSI)
+  HDL <- read.xport(paste0(path, "HDL",year,".XPT"))
+  HDL <- HDL %>% select(SEQN, LBDHDDSI)
+  LAB13 <- join_all(list(LAB13, HDL), by = "SEQN", type = "left")
+}
+names(LAB13)[3] <- "LBDHDDSI"
+data <- join_all(list(data, LAB13), by = "SEQN", type = "left")
+
+# Compute inflammation indices (cell-count-to-HDL ratios)
+data$MHR <- data$LBDMONO/data$LBDHDDSI
+data$NHR <- data$LBDNENO/data$LBDHDDSI
+data$PHR <- data$LBXPLTSI/data$LBDHDDSI
+data$LHR <- data$LBDLYMNO /data$LBDHDDSI
+
+# Homocysteine (1999-2006); NA for other years
+if(year == "" || year == "_D" || year == "_C" || year == "_B"){
+  if(year == ""){
+    LAB06 <- read.xport(paste0(path, "LAB06",year,".XPT"))
+    LAB06 <- LAB06 %>% select(SEQN, LBXHCY)
+  }else if(year == "_B"){
+    LAB06 <- read.xport(paste0(path, "L06",year,".XPT"))
+    LAB06 <- LAB06 %>% select(SEQN, LBDHCY)
+  }else if(year == "_C"){
+    LAB06 <- read.xport(paste0(path, "L06MH",year,".XPT"))
+    LAB06 <- LAB06 %>% select(SEQN, LBXHCY)
+  }else{
+    LAB06 <- read.xport(paste0(path, "HCY",year,".XPT"))
+    LAB06 <- LAB06 %>% select(SEQN, LBXHCY)
+  }
+  names(LAB06)[2] <- "LBXHCY"
+  data <- join_all(list(data, LAB06), by = "SEQN", type = "left")
+}else{
+  data$LBXHCY <- NA
+}
+
+if(year == ""){
+  LAB18 <- read.xport(paste0(path, "LAB18",year,".XPT"))
+}else if(year == "_B" || year == "_C"){
+  LAB18 <- read.xport(paste0(path, "L40",year,".XPT"))
+}else{
+  LAB18 <- read.xport(paste0(path, "BIOPRO",year,".XPT"))
+}
+LAB18 <- LAB18 %>% select(SEQN, LBDSCHSI, LBDSTRSI, LBDSBUSI, LBDSCRSI, LBXSATSI, LBXSASSI, LBDSTBSI )
+data <- join_all(list(data, LAB18), by = "SEQN", type = "left")
+
+# Thyroid function: thyroid-stimulating hormone (TSH)
+if(year == "" || year == "_B" ||year =="_E" || year =="_F" || year =="_G"){
+  if(year == ""){
+    LAB18T4 <- read.xport(paste0(path, "LAB18T4",year,".XPT"))
+    LAB18T4 <- LAB18T4 %>% select(SEQN, LBXTSH)
+  }else if(year == "_B"){
+    LAB18T4 <- read.xport(paste0(path, "L40T4",year,".XPT"))
+    LAB18T4 <- LAB18T4 %>% select(SEQN, LBXTSH)
+  }else if (year =="_E" || year =="_F" || year =="_G"){
+    LAB18T4 <- read.xport(paste0(path, "THYROD",year,".XPT"))
+    LAB18T4 <- LAB18T4 %>% select(SEQN, LBXTSH1)
+  }
+  names(LAB18T4)[2] <- "LBXTSH"
+  data <- join_all(list(data, LAB18T4), by = "SEQN", type = "left")
+}else{
+  data$LBXTSH <- NA
+}
+
+# Thyroid function: free thyroxine (2007-2012)
+if(year == "_E" || year == "_F" || year == "_G"){
+  THYROD <- read.xport(paste0(path, "THYROD",year,".XPT"))
+  THYROD <- THYROD %>% select(SEQN, LBDT4FSI)
+  data <- join_all(list(data, THYROD), by = "SEQN", type = "left")
+}else{
+  data$LBDT4FSI <- NA
+}
+
+# Smoking history
+# Adults 20+
+SMQ <- read.xport(paste0(path, "SMQ",year,".XPT"))
+SMQ <- SMQ %>% select(SEQN, SMQ020, SMD030, SMQ040, SMQ050Q, SMQ050U, SMD057)
+data <- join_all(list(data, SMQ), by = "SEQN", type = "left")
+
+# Drinking history, 20+
+ALQ <- read.xport(paste0(path, "ALQ",year,".XPT"))
+if(year == "" || year == "_B" ||year =="_E" || year =="_F" || year =="_C" || year =="_D"){
+  ALQ <- ALQ %>% select(SEQN, ALQ150, ALQ130)
+}else{
+  ALQ <- ALQ %>% select(SEQN, ALQ151, ALQ130)
+}
+names(ALQ)[2] <- "ALQ150"
+data <- join_all(list(data, ALQ), by = "SEQN", type = "left")
+
+# Dietary habits
+DBQ <- read.xport(paste0(path, "DBQ",year,".XPT"))
+if(year == "" || year == "_B" ||year =="_E" || year =="_C" || year =="_D"){
+  if(year == ""){
+    DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBQ070A, DBQ070B, DBQ070C, DBQ390, DBD400, DBD410, DBQ420, 
+        DBQ070D, DBD195, DBQ220A, DBQ220B, DBQ220C, DBQ220D, DBD235A, DBD235B, DBD235C, DBQ300, DBQ330, DBD360, DBD370, DBD380)
+  }else if (year == "_B"){
+    DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBD071A, DBD071B, DBD071C, DBQ390, DBQ400, DBD411, DBD421,
+        DBD071D, DBD196, DBD221A, DBD221B, DBD221C, DBD221D, DBD235AE, DBD235BE, DBD235CE, DBD301, DBQ330, DBQ360, DBQ370, DBD381)
+  }else if(year == "_C"){
+    DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBQ071A, DBQ071B, DBQ071C, DBQ390, DBQ400, DBD411, DBQ421,
+        DBQ071D, DBD197, DBQ221A, DBQ221B, DBQ221C, DBQ221D, DBQ235A, DBQ235B, DBQ235C, DBQ301, DBQ330, DBQ360, DBQ370, DBD381)
+  }else if(year == "_D" || year == "_E"){
+    DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBD072A, DBD072B, DBD072C, DBQ390, DBQ400, DBD411, DBQ421,
+        DBD072D, DBQ197, DBD222A, DBD222B, DBD222C, DBD222D, DBQ235A, DBQ235B, DBQ235C, DBQ301, DBQ330, DBQ360, DBQ370, DBD381)
+  }
+}else{
+  DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD041, DBD050, DBD061, DBQ073A, DBQ073B, DBQ073C, DBQ390, DBQ400, DBD411, DBQ421,
+      DBQ073D, DBQ197, DBQ223A, DBQ223B, DBQ223C, DBQ223D, DBQ235A, DBQ235B, DBQ235C, DBQ301, DBQ330, DBQ360, DBQ370, DBD381)
+}
+names(DBQ) <- c("SEQN", "DBQ010", "DBD030", "DBD041", "DBD050", "DBD061", "DBQ073A", "DBQ073B", "DBQ073C", "DBQ390", "DBQ400", "DBD411", "DBQ421",
+      "DBQ073D", "DBQ197", "DBQ223A", "DBQ223B", "DBQ223C", "DBQ223D", "DBQ235A", "DBQ235B", "DBQ235C", "DBQ301", "DBQ330", "DBQ360", "DBQ370", "DBD381") 
+data <- join_all(list(data, DBQ), by = "SEQN", type = "left")
+
+# Physical activity
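+# The blocks below turn questionnaire items into approximate MET-weighted activity scores;
+# the numeric cutoffs (e.g. < 6, < 78, < 841) appear to filter out refused/don't-know codes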
+PAQ <- read.xport(paste0(path, "PAQ",year,".XPT"))
+PAQ[is.na(PAQ)] <- 0
+if(year == ""){
+  PAQ_s <- PAQ %>% select(SEQN, PAD020, PAQ100, PAQ180, PAD440, PAQ480, PAQ560, PAD570, PAQ580)
+  PAQ_s$PAD020 <- ifelse(PAQ_s$PAD020 == 1, 4, 0)
+  PAQ_s$PAQ100 <- ifelse(PAQ_s$PAQ100 == 1, 4.5, 0)
+  PAQ_s$PAQ180 <- ifelse(PAQ_s$PAQ180 == 1, 1.4, ifelse(PAQ_s$PAQ180 == 2, 1.5, ifelse(PAQ_s$PAQ180 == 3, 1.6, ifelse(PAQ_s$PAQ180 == 4, 1.8, 0))))
+  PAQ_s$PAD440 <- ifelse(PAQ_s$PAD440 == 1, 4, 0)
+  PAQ_s$PAQ480 <- ifelse(PAQ_s$PAQ480 < 6, 1.2 * PAQ_s$PAQ480, 0)
+  PAQ_s$PAQ560 <- ifelse(PAQ_s$PAQ560 < 78, 7 * PAQ_s$PAQ560, 0)
+  PAQ_s$PAD570 <- ifelse(PAQ_s$PAD570 < 6, PAQ_s$PAD570, 0)
+  PAQ_s$PAQ580 <- ifelse(PAQ_s$PAQ580 < 6, 1.5 * PAQ_s$PAQ580, 0)
+  PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
+}else if(year =="_B"){
+  PAQ_s <- PAQ %>% select(SEQN, PAD020, PAQ100, PAQ180, PAD440, PAQ560, PAD590, PAD600)
+  PAQ_s$PAD020 <- ifelse(PAQ_s$PAD020 == 1, 4, 0)
+  PAQ_s$PAQ100 <- ifelse(PAQ_s$PAQ100 == 1, 4.5, 0)
+  PAQ_s$PAQ180 <- ifelse(PAQ_s$PAQ180 == 1, 1.4, ifelse(PAQ_s$PAQ180 == 2, 1.5, ifelse(PAQ_s$PAQ180 == 3, 1.6, ifelse(PAQ_s$PAQ180 == 4, 1.8, 0))))
+  PAQ_s$PAD440 <- ifelse(PAQ_s$PAD440 == 1, 4, 0)
+  PAQ_s$PAQ560 <- ifelse(PAQ_s$PAQ560 < 78, 7 * PAQ_s$PAQ560, 0)
+  PAQ_s$PAD590 <- ifelse(PAQ_s$PAD590 < 6, PAQ_s$PAD590, 0)
+  PAQ_s$PAD600 <- ifelse(PAQ_s$PAD600 < 6, 1.5 * PAQ_s$PAD600, 0)
+  PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
+}else if(year =="_C" || year =="_D"){
+  PAQ_s <- PAQ %>% select(SEQN, PAD020, PAQ100, PAQ180, PAD440, PAQ560, PAD590, PAD600)
+  PAQ_s$PAD020 <- ifelse(PAQ_s$PAD020 == 1, 4, 0)
+  PAQ_s$PAQ100 <- ifelse(PAQ_s$PAQ100 == 1, 4.5, 0)
+  PAQ_s$PAQ180 <- ifelse(PAQ_s$PAQ180 == 1, 1.4, ifelse(PAQ_s$PAQ180 == 2, 1.5, ifelse(PAQ_s$PAQ180 == 3, 1.6, ifelse(PAQ_s$PAQ180 == 4, 1.8, 0))))
+  PAQ_s$PAD440 <- ifelse(PAQ_s$PAD440 == 1, 4, 0)
+  PAQ_s$PAQ560 <- ifelse(PAQ_s$PAQ560 < 78, 7 * PAQ_s$PAQ560, 0)
+  PAQ_s$PAD590 <- ifelse(PAQ_s$PAD590 < 6, PAQ_s$PAD590, 0)
+  PAQ_s$PAD600 <- ifelse(PAQ_s$PAD600 < 6, 1.5 * PAQ_s$PAD600, 0)
+  PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
+}else{
+  PAQ_s <- PAQ %>% select(SEQN, PAD615, PAD630, PAD645, PAD660, PAD675)
+  PAQ_s$PAD615 <- ifelse(PAQ_s$PAD615 < 841, 8 * PAQ_s$PAD615, 0)
+  PAQ_s$PAD630 <- ifelse(PAQ_s$PAD630 < 841, 4 * PAQ_s$PAD630, 0)
+  PAQ_s$PAD645 <- ifelse(PAQ_s$PAD645 < 661, 4 * PAQ_s$PAD645, 0)
+  PAQ_s$PAD660 <- ifelse(PAQ_s$PAD660 < 481, 8 * PAQ_s$PAD660, 0)
+  PAQ_s$PAD675 <- ifelse(PAQ_s$PAD675 < 541, 4 * PAQ_s$PAD675, 0)
+  PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
+  PAQ$PAAQUEX <- NA
+}
+PAQ <- PAQ %>% select(SEQN, SCORE, PAAQUEX)
+data <- join_all(list(data, PAQ), by = "SEQN", type = "left")
+
+# Sleep problems
+if(year != "" && year != "_B" && year != "_C"){
+  SLQ <- read.xport(paste0(path, "SLQ",year,".XPT"))
+  SLQ <- SLQ %>% select(SEQN, SLQ050)
+  data <- join_all(list(data, SLQ), by = "SEQN", type = "left")
+}else{
+  data$SLQ050 <- NA
+}
+
+
+# Mental health
+if (year =="" || year == "_B" || year == "_C"){
+  # Mental health - depression: depression score
+  if(year==""){
+    CIQMDEP <- read.xport(paste0(path, "CIQMDEP",year,".XPT"))
+  }else if (year == "_B" | year == "_C"){
+    CIQMDEP <- read.xport(paste0(path, "CIQDEP",year,".XPT"))
+  }
+  CIQMDEP <- CIQMDEP %>% select(SEQN, CIDDSCOR)
+  data <- join_all(list(data, CIQMDEP), by = "SEQN", type = "left")
+
+  # # Mental health - generalized anxiety disorder: anxiety score
+  # CIQGAD <- read.xport(paste0(path, "CIQGAD",year,".XPT"))
+  # CIQGAD <- CIQGAD %>% select(SEQN, CIDGSCOR)
+  # data <- join_all(list(data, CIQGAD), by = "SEQN", type = "left")
+
+  # # Mental health - panic disorder: panic score
+  # if(year==""){
+  #   CIQPANIC <- read.xport(paste0(path, "CIQPANIC",year,".XPT"))
+  # }else{
+  #   CIQPANIC <- read.xport(paste0(path, "CIQPAN",year,".XPT"))
+  # }
+  # CIQPANIC <- CIQPANIC %>% select(SEQN, CIDPSCOR)
+  # data <- join_all(list(data, CIQPANIC), by = "SEQN", type = "left")
+}else{
+  DPQ <- read.xport(paste0(path, "DPQ",year,".XPT"))
+  DPQ <- DPQ[, -which(names(DPQ) == "DPQ100")]
+  DPQ <- na.omit(DPQ)
+  DPQ$CIDDSCOR <- apply(DPQ[, -1], 1, function(x) sum(x[x < 7]))
+  DPQ <- DPQ %>% select(SEQN, CIDDSCOR)
+  data <- join_all(list(data, DPQ), by = "SEQN", type = "left")
+}
+
+# Social support
+if(year =="" || year == "_B" || year == "_C"){
+  SSQ <- read.xport(paste0(path, "SSQ",year,".XPT"))
+}else if(year =="_D" || year == "_E"){
+  SSQ <- read.xport(paste0(path, "SSQ",year,".XPT"))
+  SSQ <- SSQ[, !(names(SSQ) == "SSD044")]
+}else{
+  # These waves have no SSQ file; build an all-NA frame keyed by SEQN so the join still works
+  SSQ <- data.frame(SEQN = data$SEQN)
+  SSQ[c("SSQ011", "SSQ021A", "SSQ021B", "SSQ021C", "SSQ021D", "SSQ021E", "SSQ021F", "SSQ021G", "SSQ021H", "SSQ021I", "SSQ021J", "SSQ021K",
+      "SSQ021L", "SSQ021M", "SSQ021N", "SSQ031", "SSQ041", "SSQ051", "SSQ061")] <- NA
+}
+names(SSQ) <- c("SEQN", "SSQ011", "SSQ021A", "SSQ021B", "SSQ021C", "SSQ021D", "SSQ021E", "SSQ021F", "SSQ021G", "SSQ021H", "SSQ021I", "SSQ021J", "SSQ021K",
+      "SSQ021L", "SSQ021M", "SSQ021N", "SSQ031", "SSQ041", "SSQ051", "SSQ061")
+data <- join_all(list(data, SSQ), by = "SEQN", type = "left")
+
+# Disease diagnoses (hypertension, coronary heart disease, angina, heart failure, heart attack)
+MCQ <- read.xport(paste0(path, "MCQ",year,".XPT"))
+if(year =="" || year == "_B" || year == "_C" || year =="_D" || year == "_E" || year == "_F"){
+  MCQ <- MCQ %>% select(SEQN, MCQ160B, MCQ180B, MCQ160C, MCQ180C, MCQ160D, MCQ180D, MCQ160E, MCQ180E)
+}else if(year =="_G" || year == "_H" || year == "_I" ){
+  MCQ <- MCQ %>% select(SEQN, MCQ160B, MCQ180B, MCQ160C, MCQ180C, MCQ160D, MCQ180D, MCQ160E, MCQ180E)
+}else{
+  MCQ <- MCQ %>% select(SEQN, MCQ160B, MCD180B, MCQ160C, MCD180C, MCQ160D, MCD180D, MCQ160E, MCD180E)
+}
+names(MCQ) <- c("SEQN", "MCQ160B", "MCD180B", "MCQ160C", "MCD180C", "MCQ160D", "MCD180D", "MCQ160E", "MCD180E")
+data <- join_all(list(data, MCQ), by = "SEQN", type = "left")
+
+# Medication records: days taken and counts
+# Common cardiology drugs
+medicine = c("HYDROCHLOROTHIAZIDE", "VALSARTAN", "AMIODARONE", "AMLODIPINE", "ATORVASTATIN",
+   "BENAZEPRIL", "OLMESARTAN", "PERINDOPRIL","TELMISARTAN","BENAZEPRIL","BENDROFLUMETHIAZIDE",
+   "BEZAFIBRATE","BISOPROLOL","Captopril","Carvedilol", "EPINEPHRINE", "QUINIDINE", "Digoxin", 
+   "Diltiazem", "Enalapril", "Eprosartan", "ERTUGLIFLOZIN", "SITAGLIPTIN", "EZETIMIBE", "SIMVASTATIN",
+   "FENOFIBRIC ACID", "Furosemide", "GLICLAZIDE", "GLIPIZIDE", "METFORMIN", "HYDRALAZINE", "ISOSORBIDE DINITRATE",
+   "IRBESARTAN", "METOPROLOL", "PROPRANOLOL", "ISOSORBIDE MONONITRATE", "NIFEDIPINE", "NITROGLYCERIN",
+   "PITAVASTATIN", "PRAVASTATIN", "PRAZOSIN", "PROCAINAMIDE", "PROPAFENONE", "SACUBITRIL", "SOTALOL", "SPIRONOLACTONE",
+   "TICAGRELOR", "TORSEMIDE", "VERAPAMIL", "TRIAMTERENE", "WARFARIN")
+RXQ_RX <- read.xport(paste0(path, "RXQ_RX",year,".XPT"))
+if(year == "" || year == "_B"){
+  day_RXQ_RX <- RXQ_RX %>%
+    select(SEQN, RXD240B, RXD260) %>%
+    na.omit(.) %>%
+    pivot_wider(names_from = RXD240B, values_from = RXD260,
+    names_prefix = "day_",values_fill = 0, values_fn = list(RXD260 = sum))
+  number_RXQ_RX <- RXQ_RX %>%
+    select(SEQN, RXD240B, RXD295) %>%
+    na.omit(.) %>%
+    pivot_wider(names_from = RXD240B, values_from = RXD295,
+    names_prefix = "number_",values_fill = 0, values_fn = list(RXD295 = sum))
+}else{
+  day_RXQ_RX <- RXQ_RX %>%
+    select(SEQN, RXDDRUG, RXDDAYS) %>%
+    na.omit(.) %>%
+    pivot_wider(names_from = RXDDRUG, values_from = RXDDAYS,
+    names_prefix = "day_",values_fill = 0, values_fn = list(RXDDAYS = sum))
+  number_RXQ_RX <- RXQ_RX %>%
+    select(SEQN, RXDDRUG, RXDCOUNT) %>%
+    na.omit(.) %>%
+    pivot_wider(names_from = RXDDRUG, values_from = RXDCOUNT,
+    names_prefix = "number_",values_fill = 0, values_fn = list(RXDCOUNT = sum))
+}
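+# For each drug of interest, sum the day/count columns across all name variants matched by grep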
+for (item in medicine){
+  med_columns <- grep(item, names(day_RXQ_RX), value = TRUE)
+  tmp = day_RXQ_RX[, c("SEQN", med_columns)]
+  tmp[[paste0("day_", item)]] <- apply(tmp[, -1], 1, function(x) sum(x))
+  tmp <- tmp %>% select(SEQN, paste0("day_", item))
+  data <- join_all(list(data, tmp), by = "SEQN", type = "left")
+  med_columns <- grep(item, names(number_RXQ_RX), value = TRUE)
+  tmp = number_RXQ_RX[, c("SEQN", med_columns)]
+  tmp[[paste0("num_", item)]] <- apply(tmp[, -1], 1, function(x) sum(x))
+  tmp <- tmp %>% select(SEQN, paste0("num_", item))
+  data <- join_all(list(data, tmp), by = "SEQN", type = "left")
+}
+
+# Health-care utilization
+HUQ <- read.xport(paste0(path, "HUQ",year,".XPT"))
+if(year == "" || year == "_B" ||year =="_E" || year =="_C" || year =="_D" || year =="_F"|| year =="_G"){
+  if(year == ""){
+    HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ040, HUQ050, HUQ060, HUQ070, HUD080, HUQ090)
+  }else if(year == "_B"){
+    HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ040, HUQ050, HUQ060, HUD070, HUQ080, HUQ090)
+  }else{
+    HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ040, HUQ050, HUQ060, HUQ071, HUD080, HUQ090)
+  }
+}else{
+  HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ041, HUQ051, HUQ061, HUQ071, HUD080, HUQ090)
+}
+names(HUQ) <- c("SEQN", "HUQ010", "HUQ020", "HUQ030", "HUQ041", "HUQ051", "HUQ061", "HUQ071", "HUD080", "HUQ090")
+data <- join_all(list(data, HUQ), by = "SEQN", type = "left")
+
+# Health status: stomach/bowel illness, HIV, flu, etc.; recent blood donation
+HSQ <- read.xport(paste0(path, "HSQ",year,".XPT"))
+if(year == ""){
+  HSQ <- HSQ %>% select(SEQN, HSQ500, HSQ510, HSQ520, HSQ570, HSQ580, HSQ590, HSAQUEX)
+}else if(year == "_B"){
+  HSQ <- HSQ %>% select(SEQN, HSQ500, HSQ510, HSQ520, HSD570, HSQ580, HSQ590, HSAQUEX)
+}else{
+  HSQ <- HSQ %>% select(SEQN, HSQ500, HSQ510, HSQ520, HSQ571, HSQ580, HSQ590, HSAQUEX)
+}
+names(HSQ) <- c("SEQN", "HSQ500", "HSQ510", "HSQ520", "HSQ571", "HSQ580", "HSQ590", "HSAQUEX")
+data <- join_all(list(data, HSQ), by = "SEQN", type = "left")
+
+# Physical functioning
+PFQ <- read.xport(paste0(path, "PFQ",year,".XPT"))
+if(year == ""){
+  PFQ <- PFQ %>% select(SEQN, PFQ020, PFQ030, PFQ040, PFQ048, PFQ050, PFQ055, PFQ056, 
+          PFQ059, PFQ060A, PFQ060B, PFQ060C,PFQ060D,PFQ060E,PFQ060F,PFQ060G,PFQ060H,PFQ060I,PFQ060J,PFQ060K,
+          PFQ060L,PFQ060M,PFQ060N,PFQ060O,PFQ060P,PFQ060Q,PFQ060R,PFQ060S, PFD067A, PFD067B,PFD067C,PFD067D,
+          PFD067E, PFQ090)
+}else if(year == "_B"){
+  PFQ <- PFQ %>% select(SEQN, PFQ020, PFQ030, PFD040, PFQ048, PFQ050, PFQ055, PFQ056, 
+          PFQ059, PFQ060A, PFQ060B, PFQ060C,PFQ060D,PFQ060E,PFQ060F,PFQ060G,PFQ060H,PFQ060I,PFQ060J,PFQ060K,
+          PFQ060L,PFQ060M,PFQ060N,PFQ060O,PFQ060P,PFQ060Q,PFQ060R,PFQ060S, PFD067A, PFD067B,PFD067C,PFD067D,
+          PFD067E, PFQ090)
+}else{
+  PFQ <- PFQ %>% select(SEQN, PFQ020, PFQ030, PFQ041, PFQ049, PFQ051, PFQ054, PFQ057,
+          PFQ059, PFQ061A, PFQ061B,PFQ061C,PFQ061D,PFQ061E,PFQ061F,PFQ061G,PFQ061H,PFQ061I,PFQ061J,PFQ061K,
+          PFQ061L,PFQ061M,PFQ061N,PFQ061O,PFQ061P,PFQ061Q, PFQ061R,PFQ061S, PFQ063A, PFQ063B,PFQ063C,PFQ063D,
+          PFQ063E, PFQ090)
+}
+names(PFQ) <- c("SEQN", "PFQ020", "PFQ030", "PFQ041", "PFQ049", "PFQ051", "PFQ054", "PFQ057",
+          "PFQ059", "PFQ061A", "PFQ061B","PFQ061C","PFQ061D","PFQ061E","PFQ061F","PFQ061G","PFQ061H","PFQ061I","PFQ061J","PFQ061K",
+          "PFQ061L","PFQ061M","PFQ061N","PFQ061O","PFQ061P","PFQ061Q", "PFQ061R","PFQ061S", "PFQ063A", "PFQ063B","PFQ063C","PFQ063D",
+          "PFQ063E", "PFQ090")
+data <- join_all(list(data, PFQ), by = "SEQN", type = "left")
+
+# cognitive functioning
+if(year %in% c("", "_B", "_G", "_H")){
+  CFQ <- read.xport(paste0(path, "CFQ",year,".XPT"))
+  if(year == "" || year == "_B"){
+    CFQ <- CFQ %>% select(SEQN, CFDRIGHT)
+  }else{
+    CFQ <- CFQ %>% select(SEQN, CFDDS)
+  }
+  names(CFQ) <- c("SEQN", "CFDDS")
+  data <- join_all(list(data, CFQ), by = "SEQN", type = "left")
+}else{
+  data$CFDDS <- NA
+}
+
+mortality_result = read.csv(file = paste0(path, "mortality_result.csv"))
+
+data <- join_all(list(data, mortality_result), by = "SEQN", type = "left")
+
+write.csv(data, file = paste0(path, "result", year, ".csv"), row.names = FALSE)
+
+csv_files <- list.files(path = "NHANES", pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
+df_combined <- data.frame()
+# full.names = TRUE above ensures complete paths when reading the files
+if (length(csv_files) > 0) {
+  for (file in csv_files) {
+    if (grepl("mortality", file)){
+      next
+    }
+    # read each .csv file
+    data <- read.csv(file, stringsAsFactors = FALSE)
+    print(ncol(data))
+    if (nrow(df_combined) == 0){
+      df_combined <- data
+    }else{
+      df_combined <- rbind(data, df_combined)
+    }
+    # further data processing could be added here
+    print(paste("Read file:", file))
+  }
+}
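+# Note: rbind() requires every wave's file to share identical columns; if the
+# column sets ever diverge, dplyr::bind_rows() fills the gaps with NA instead of
+# erroring. A hedged alternative sketch, not part of the original script:
+# library(dplyr)
+# df_combined <- bind_rows(lapply(csv_files[!grepl("mortality", csv_files)],
+#                                 read.csv, stringsAsFactors = FALSE))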
+write.csv(df_combined, file = paste0("NHANES/", "result_all", ".csv"), row.names = FALSE)
+# data = read.csv(file = paste0(path, "result.csv"))
+
+# head(data)
+
+# add pollutant data
+library(ncdf4)
+library(raster)
+
+nc <- nc_open("CHAP_PM2.5_Y1K_2009_V4.nc")
+
+data <- ncvar_get(nc, "PM2.5")
+print(dim(data))
+lon <- ncvar_get(nc, "lon")
+lat <- ncvar_get(nc, "lat")
+tori <- ncvar_get(nc, "time")
+
+tunits <- ncatt_get(nc, "time")
+
+# writeRaster() cannot take an ncdf4 handle directly; build a raster from the
+# matrix first. This assumes the variable is stored as lon x lat (hence the
+# transpose); depending on latitude ordering a flip() may also be needed.
+pm_raster <- raster(t(data), xmn = min(lon), xmx = max(lon),
+                    ymn = min(lat), ymx = max(lat),
+                    crs = "+proj=longlat +datum=WGS84")
+writeRaster(x = pm_raster, filename = 'pre1.tif', format = 'GTiff', overwrite = TRUE)
+
+
+
+
+
+# China_Health_and_Retirement_Longitudinal_Study_CHARLS
+# demo_china <- read.dta("CHARLS/CHARLS2011_Dataset/demographic_background.dta")
+
+# household_income_china <- read.dta("CHARLS/CHARLS2011_Dataset/household_income.dta")
+
+# # UKDA
+# demo_UK <- read.dta("UKDA-5050-stata/stata/stata13_se/wave_0_1998_data.dta")

+ 8 - 0
different.py

@@ -0,0 +1,8 @@
+import pandas as pd
+from glob import glob
+
+year = "2011"
+path = "/root/r_base/CHARLS"
+
+if __name__ == "__main__":
+    year = "2011"

+ 246 - 0
dotproduct_attentionMLP.py

@@ -0,0 +1,246 @@
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+import tensorflow_addons as tfa
+import matplotlib.pyplot as plt
+
+class PatchExtract(layers.Layer): # extract patches from an image
+    def __init__(self, patch_size, **kwargs):
+        super(PatchExtract, self).__init__(**kwargs)
+        self.patch_size = patch_size
+
+    def call(self, images):
+        batch_size = tf.shape(images)[0]
+        patches = tf.image.extract_patches(
+            images=images,
+            sizes=(1, self.patch_size, self.patch_size, 1),
+            strides=(1, self.patch_size, self.patch_size, 1),
+            rates=(1, 1, 1, 1),
+            padding="VALID",
+        )
+        patch_dim = patches.shape[-1]
+        patch_num = patches.shape[1]
+        return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim))
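+# Shape sketch for the settings used below (input_shape = (32, 32, 3),
+# patch_size = 2): each image yields (32 / 2)^2 = 256 patches with
+# patch_dim = 2 * 2 * 3 = 12, so call() returns (batch, 256, 12).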
+
+
+class PatchEmbedding(layers.Layer): # project patches to a low-dimensional embedding plus positional encoding
+    def __init__(self, num_patch, embed_dim, **kwargs):
+        super(PatchEmbedding, self).__init__(**kwargs)
+        self.num_patch = num_patch
+        self.proj = layers.Dense(embed_dim)
+        self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)
+
+    def call(self, patch):
+        pos = tf.range(start=0, limit=self.num_patch, delta=1)
+        return self.proj(patch) + self.pos_embed(pos)
+    
+    
+def dotproduct_attention(
+    x, dim, num_heads, dim_coefficient=4, attention_dropout=0, projection_dropout=0
+): # attention over a batch of patches shaped (batch, num_patch, channel);
+    # dim is the embedding size, num_heads must divide dim, dim_coefficient
+    # expands the projections, and the two dropout rates apply to the attention
+    # weights and the output projection (0 disables them)
+    _, num_patch, channel = x.shape # number of patches and channels
+    assert dim % num_heads == 0 # dim must be a multiple of num_heads
+    num_heads = num_heads * dim_coefficient # expand the head count
+
+    x = layers.Dense(dim * dim_coefficient)(x) # Dense computes activation(x . kernel + bias):
+                                               # project each patch to dim * dim_coefficient features
+    x = tf.reshape(
+        x, shape=(-1, num_patch, num_heads, dim * dim_coefficient // num_heads)
+    )  # split the features into heads
+    x = tf.transpose(x, perm=[0, 2, 1, 3]) # move the head axis before the patch axis
+    attn = layers.Dense(dim // dim_coefficient)(x) # project to attention logits
+    attn = layers.Softmax(axis=2)(attn) # normalize over the patch axis
+    attn = attn / (1e-9 + tf.reduce_sum(attn, axis=-1, keepdims=True)) # second normalization:
+                                                                        # divide each row by its sum
+    attn = layers.Dropout(attention_dropout)(attn) # drop some attention weights
+    x = layers.Dense(dim * dim_coefficient // num_heads)(attn) # project back to the per-head width
+    x = tf.transpose(x, perm=[0, 2, 1, 3]) # swap the head and patch axes back
+    x = tf.reshape(x, [-1, num_patch, dim * dim_coefficient]) # merge the heads again
+    x = layers.Dense(dim)(x) # final projection back to dim features
+    x = layers.Dropout(projection_dropout)(x) # output dropout
+    return x
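+# Shape trace for the defaults used below (dim = 64, num_heads = 4,
+# dim_coefficient = 4): heads expand to 4 * 4 = 16, the first Dense outputs
+# 64 * 4 = 256 features (16 per head), the attention logits have
+# 64 // 4 = 16 dims, and the last Dense projects back to dim = 64.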
+
+def mlp(x, embedding_dim, mlp_dim, drop_rate=0.2): # feed-forward MLP block
+    x = layers.Dense(mlp_dim, activation=tf.nn.gelu)(x)
+    x = layers.Dropout(drop_rate)(x)
+    x = layers.Dense(embedding_dim)(x)
+    x = layers.Dropout(drop_rate)(x)
+    return x
+
+def transformer_encoder(
+    x,
+    embedding_dim,
+    mlp_dim,
+    num_heads,
+    dim_coefficient,
+    attention_dropout,
+    projection_dropout,
+    attention_type="dotproduct_attention",
+): # one transformer encoder block
+    residual_1 = x
+    x = layers.LayerNormalization(epsilon=1e-5)(x)
+    if attention_type == "dotproduct_attention":
+        x = dotproduct_attention(
+            x,
+            embedding_dim,
+            num_heads,
+            dim_coefficient,
+            attention_dropout,
+            projection_dropout,
+        )
+    elif attention_type == "self_attention":
+        x = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embedding_dim, dropout=attention_dropout
+        )(x, x)
+    x = layers.add([x, residual_1])
+    residual_2 = x
+    x = layers.LayerNormalization(epsilon=1e-5)(x)
+    x = mlp(x, embedding_dim, mlp_dim)
+    x = layers.add([x, residual_2])
+    return x
+
+def get_model(attention_type="dotproduct_attention"):
+    inputs = layers.Input(shape=input_shape)
+    x = data_augmentation(inputs)
+    x = PatchExtract(patch_size)(x)
+    x = PatchEmbedding(num_patches, embedding_dim)(x)
+    for _ in range(num_transformer_blocks):
+        x = transformer_encoder(
+            x,
+            embedding_dim,
+            mlp_dim,
+            num_heads,
+            dim_coefficient,
+            attention_dropout,
+            projection_dropout,
+            attention_type,
+        )
+
+    x = layers.GlobalAvgPool1D()(x)
+    outputs = layers.Dense(num_classes, activation="softmax")(x)
+    model = keras.Model(inputs=inputs, outputs=outputs)
+    return model
+
+
+# load the data
+num_classes = 100
+input_shape = (32, 32, 3)
+
+(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data()
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
+print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")
+
+# hyperparameters
+weight_decay = 0.0001
+learning_rate = 0.001
+label_smoothing = 0.1
+validation_split = 0.2
+batch_size = 128
+num_epochs = 50
+patch_size = 2  # size of the window used to extract patches (2 x 2)
+num_patches = (input_shape[0] // patch_size) ** 2  # number of patches per image
+embedding_dim = 64  # number of hidden units
+mlp_dim = 64
+dim_coefficient = 4
+num_heads = 4
+attention_dropout = 0.2
+projection_dropout = 0.2
+num_transformer_blocks = 8  # number of repeated transformer blocks
+
+print(f"Patch size: {patch_size} X {patch_size} = {patch_size ** 2} ")
+print(f"Patches per image: {num_patches}")
+
+
+# data augmentation
+data_augmentation = keras.Sequential(
+    [
+        layers.Normalization(),
+        layers.RandomFlip("horizontal"),
+        layers.RandomRotation(factor=0.1),
+        layers.RandomContrast(factor=0.1),
+        layers.RandomZoom(height_factor=0.2, width_factor=0.2),
+    ],
+    name="data_augmentation",
+)
+# compute the training set's mean and variance for the Normalization layer
+data_augmentation.layers[0].adapt(x_train)
+
+# build the model
+model = get_model(attention_type="dotproduct_attention")
+
+model.compile(
+    loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),
+    optimizer=tfa.optimizers.AdamW(
+        learning_rate=learning_rate, weight_decay=weight_decay
+    ),
+    metrics=[
+        keras.metrics.CategoricalAccuracy(name="accuracy"),
+        keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"),
+    ],
+)
+
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=batch_size,
+    epochs=num_epochs,
+    validation_split=validation_split,
+)
+
+
+# plot the confusion matrix
+from sklearn.metrics import confusion_matrix
+import itertools
+plt.rcParams['figure.figsize'] = [12,12]
+
+def plot_confusion_matrix(cm, classes,
+                          normalize=False,
+                          title='Confusion matrix',
+                          cmap=plt.cm.Blues):
+
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+    print(cm)
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title)
+    plt.colorbar()
+    tick_marks = np.arange(len(classes))
+    plt.xticks(tick_marks[0::2], classes[0::2], rotation=0)
+    plt.yticks(tick_marks[0::2], classes[0::2])
+    '''
+    fmt = '.2f' if normalize else 'd'
+    thresh = cm.max() / 2.
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], fmt),
+               horizontalalignment="center",
+               color="white" if cm[i, j] > thresh else "black")
+    '''
+
+    plt.tight_layout()
+    plt.ylabel('True label')
+    plt.xlabel('Predicted label')
+    plt.savefig('./picture/confusion_matrix.jpeg',dpi=1200,bbox_inches='tight')
+    plt.show()
+
+p_test = model.predict(x_test).argmax(axis=1)
+cm = confusion_matrix(y_test.argmax(axis=1), p_test)
+plot_confusion_matrix(cm, list(range(100)))
+
+# plot the training and validation loss
+plt.plot(history.history["loss"], label="train_loss")
+plt.plot(history.history["val_loss"], label="val_loss")
+plt.xlabel("Epochs")
+plt.ylabel("Loss")
+plt.title("Train and Validation Losses Over Epochs", fontsize=14)
+plt.legend()
+plt.grid()
+plt.savefig('./picture/loss_function.jpeg',dpi=800,bbox_inches='tight')
+plt.show()

+ 185 - 0
nc2geotiff.py

@@ -0,0 +1,185 @@
+#;+
+#; :Author: Dr. Jing Wei (Email: weijing_rs@163.com)
+#;-
+import os
+from time import sleep
+# import gdal
+import netCDF4 as nc
+import numpy as np  
+from glob import glob
+import requests
+import json
+import pandas as pd
+import concurrent.futures
+# from osgeo import osr
+
+#Define work and output paths
+WorkPath = r'/root/r_base/O3'
+OutPath  = WorkPath
+
+#Define air pollutant type 
+#e.g., PM1, PM2.5, PM10, O3, NO2, SO2, and CO, et al.
+AP = 'O3'
+
+#Define spatial resolution 
+#e.g., 1 km ≈ 0.01 Degree
+SP = 0.01 #Degrees
+
+if not os.path.exists(OutPath):
+    os.makedirs(OutPath)
+path = glob(os.path.join(WorkPath, '*.nc'))
+
+# worker function executed by each thread
+def thread_work(vll_work, start, end):
+    result_work_array = []
+    for value_work, lat_ind_work, lon_ind_work in vll_work:
+        params = {
+            "lng": "{:.6f}".format(lon[lon_ind_work]),
+            "lat": "{:.6f}".format(lat[lat_ind_work])
+        }
+        flag = True
+        while(flag):
+            try:
+                response_work = requests.get(url="http://103.116.120.27:9527/queryPoint",params = params)
+                flag = False
+            except Exception as e:
+                print(f"request failed, retrying: {e}")
+                sleep(10)
+                pass
+        res_json_work = json.loads(response_work.text)
+        # the coordinate falls inside China
+        list_local_work = res_json_work['v']['list']
+        if len(list_local_work) > 0:
+            try:
+                if len(list_local_work) == 1:
+                    province_city_work = [local_work['ext_path'] for local_work in list_local_work if local_work['deep'] == '0']
+                    tmp_result_work = [province_city_work[0], province_city_work[0], value_work]
+                    result_work_array.append(tmp_result_work)
+                else:
+                    province_city_work = [local_work['ext_path'] for local_work in list_local_work if local_work['deep'] == '1']
+                    tmp_result_work = [province_city_work[0].split(" ")[0], province_city_work[0].split(" ")[1], value_work]
+                    result_work_array.append(tmp_result_work)
+            except Exception as e:
+                print("unexpected response: " + json.dumps(list_local_work))
+        else:
+            print(f"no region found for coordinate: {lon[lon_ind_work]},{lat[lat_ind_work]}\n")
+        # if len(result_work_array) % 100 == 0 :
+            # print(f"thread range start {start}, end {end}, processed {len(result_work_array)}\n")
+    return result_work_array
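+# The parsing above assumes the queryPoint endpoint returns JSON shaped roughly
+# like the sketch below -- inferred from this code, not from an API spec:
+# {"v": {"list": [
+#     {"deep": "0", "ext_path": "Beijing"},
+#     {"deep": "1", "ext_path": "Guangdong Guangzhou"}
+# ]}}
+# deep == "0" marks a province-level entry (province doubles as city);
+# deep == "1" carries "province city" in ext_path.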
+
+for file in path:
+    # accumulated results across all years
+    file_path = "result.csv"
+    # extract the year from the file name
+    year = file.split("_")[4]
+    print(f"processing year {year}")
+    f = nc.Dataset(file)   
+    #Read SDS data
+    data = np.array(f[AP][:]) 
+    #Define missing value: NaN or -999
+    data[data==65535] = np.nan #-999 
+    #Read longitude and latitude information
+    lon = np.array(f['lon'][:])
+    lat = np.array(f['lat'][:])
+    # indices of the non-missing cells
+    indices = np.where(~np.isnan(data))
+    # the non-missing values
+    values = data[indices]
+    # zip into (value, lat_index, lon_index) triples
+    vll = list(zip(values, indices[0], indices[1]))
+    # checkpoint file holding the resume index for this year
+    index_path = "index_"+year+".txt"
+    # try to open an existing checkpoint (idx_file avoids shadowing the loop variable `file`)
+    try:
+        with open(index_path, 'r') as idx_file:
+            # if the checkpoint exists, resume from the recorded position
+            index = idx_file.readline()
+    # if it does not exist, start from the beginning
+    except FileNotFoundError:
+        index = 0
+    vll = vll[int(index):]
+    # how large a chunk of one year's data to process per batch
+    batch_size = 100000
+    total_len = len(vll)
+    if total_len == 0:
+        continue
+    batch_start = 0
+    # number of worker threads
+    max_workers = 10
+    with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
+        for batch_i in range(total_len // batch_size + 1):
+            # try to load previously saved results
+            try:
+                # if the results file exists, read it
+                result_all = pd.read_csv(file_path)
+            # if it does not exist, start with an empty result
+            except FileNotFoundError:
+                result_all = []
+            batch_end = min(batch_start + batch_size, total_len)
+            vll_one = vll[batch_start:batch_end]
+            batch_start = batch_end
+            result_array = []
+            # call the API in parallel to map coordinates to cities
+            start = 0
+            avg = len(vll_one)//max_workers
+            remainder = len(vll_one) % max_workers
+            all_task = []
+            for w in range(max_workers):
+                if w < remainder:
+                    end = start + avg + 1
+                else:
+                    end = start + avg
+                all_task.append(executor.submit(thread_work, vll_one[start:end], start, end))
+                start = end
+            for future in concurrent.futures.as_completed(all_task):
+                data = future.result()
+                result_array = result_array + data
+            # average duplicate (province, city) entries
+            columns = ['province', 'city', year]
+            result_df = pd.DataFrame(result_array, columns=columns)
+            if len(result_all) == 0 :
+                result_all = result_df.groupby(['province', 'city']).mean().reset_index()
+            else:
+                result_one = result_df.groupby(['province', 'city']).mean().reset_index()
+                # merge with the accumulated results
+                if year in result_all.columns:
+                    print("============ newly added data ================")
+                    print(result_one)
+                    concatenated_df = pd.concat([result_all[['province', 'city', year]], result_one])
+                    # aggregate with groupby
+                    grouped_df = concatenated_df.groupby(['province', 'city']).mean().reset_index()
+                    result_all = pd.merge(result_all, grouped_df, on=['province', 'city'], how='outer', suffixes=('', '_total'))
+                    # replace the old column with the re-aggregated one
+                    result_all[year] = result_all[year+"_total"]
+                    result_all = result_all.drop([year+"_total"], axis=1)
+                else:
+                    result_all = pd.merge(result_all, result_one, on=['province', 'city'], how="outer")
+            print("============ merged data ================")
+            print(result_all.head())
+            result_all.to_csv("result.csv",index=False)
+            with open(index_path, 'w') as idx_file:
+                # persist the resume index
+                idx_file.write(str(batch_end+int(index)))
+    # LonMin,LatMax,LonMax,LatMin = lon.min(),lat.max(),lon.max(),lat.min()    
+    # N_Lat = len(lat) 
+    # N_Lon = len(lon)
+    # Lon_Res = SP #round((LonMax-LonMin)/(float(N_Lon)-1),2)
+    # Lat_Res = SP #round((LatMax-LatMin)/(float(N_Lat)-1),2)
+    # #Define Define output file
+    # fname = os.path.basename(file).split('.nc')[0]
+    # outfile = OutPath + '/{}.tif' .format(fname)        
+    #Write GeoTIFF
+    # driver = gdal.GetDriverByName('GTiff')    
+    # outRaster = driver.Create(outfile,N_Lon,N_Lat,1,gdal.GDT_Float32)
+    # outRaster.SetGeoTransform([LonMin-Lon_Res/2,Lon_Res,0,LatMax+Lat_Res/2,0,-Lat_Res])
+    # sr = osr.SpatialReference()
+    # sr.SetWellKnownGeogCS('WGS84')
+    # outRaster.SetProjection(sr.ExportToWkt())
+    # outRaster.GetRasterBand(1).WriteArray(data)
+    # print(fname+'.tif',' Finished')     
+    # #release memory
+    # del outRaster
+    f.close()
+# result_all.reset_index(inplace=True)
+# print(result_all.head())
+# result_all.to_csv("result.csv",index=False)

+ 38 - 0
night_light_data_preprocess.py

@@ -0,0 +1,38 @@
+import pandas as pd
+from glob import glob
+import os
+import re
+
+# folder holding the yearly night-light tables
+folderpath = "night_light"
+
+# list the night-light files, one per year
+files = os.listdir(folderpath)
+
+# mapping between place names and IDs
+ok_data_level3 = pd.read_csv("ok_data_level3.csv")
+# empty frame to collect the final result
+result_all = pd.DataFrame()
+for file_name in files:
+    # extract the year from the file name
+    match = re.search(r'(F\d{4}|SNNP\d{4})', file_name)
+    if match:
+        year_str = match.group(0)[-4:]
+        year = int(year_str)
+        data_one_year = pd.read_csv(os.path.join(folderpath, file_name))
+        # only id, ext_name and MEAN are needed
+        if result_all.empty:
+            tmp_result = pd.merge(data_one_year, ok_data_level3, on="id", how="left")
+            # .copy() avoids pandas' SettingWithCopyWarning on the rename below
+            result_all = tmp_result[["id", "ext_name", "MEAN"]].copy()
+            # rename the MEAN column to the year
+            result_all.rename(columns={"MEAN":year}, inplace=True)
+        else:
+            result_all = pd.merge(result_all, data_one_year[["id", "MEAN"]], on="id", how="left")
+            # rename the MEAN column to the year
+            result_all.rename(columns={"MEAN":year}, inplace=True)
+    else:
+        print(f"no year found in file: {file_name}")
+    
+result_all.to_csv("night_light_result.csv", encoding="utf-8", index=False)    
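+# Expected layout of night_light_result.csv (values illustrative only):
+#   id,ext_name,2011,2012
+#   110101,Dongcheng,23.4,25.1
+# one row per region id, one MEAN column per year found in the file names.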
+
+