root 2 ヶ月 前
コミット
a4a6f31e89
4 ファイル変更926 行追加67 行削除
  1. 16 17
      CHARLS_P/CHARLS_NL.py
  2. 50 47
      CHARLS_P/CHARLS_PM.py
  3. 676 0
      CHARLS_P/CHARLS_preprocess_main.py
  4. 184 3
      CLHLS_P/CLHLS_process.py

+ 16 - 17
CHARLS_P/CHARLS_NL.py

@@ -1,21 +1,20 @@
 import pandas as pd
 
-
-#读取CHARLS数据
-CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
+years = [2011, 2013,2015, 2018, 2020]
 #读取夜光数据
 pollutants_data = pd.read_csv("night_light_result.csv", encoding="utf-8")
-#处理哪一年的数据
-year = 2020
-#新增两列,分别为year的去年和前年的环境值
-# CHARLS_data[['last_year_pm2.5', "before_last_pm2.5"]]=''
-#开始筛选出year的数据
-CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
-#两个表合并
-table_merge = pd.merge(CHARLS_data_year, pollutants_data, left_on="city", right_on="ext_name", how='left')
-# table_merge_last.to_csv("123.csv",index=False)
-#更新CHARLS表
-CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_nl'] = table_merge[str(year-1)].values
-CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_nl'] = table_merge[str(year-2)].values
-CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
-print(year)
+# Load the CHARLS table once. Every wave is updated in memory and the file is
+# written back a single time after the loop, instead of being re-read and
+# re-written for each year.
+CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
+for year in years:
+    # Rows that belong to the survey wave being processed.
+    wave_mask = CHARLS_data['wave'] == year
+    CHARLS_data_year = CHARLS_data[wave_mask]
+    # Left-join the wave's rows to the night-light table on the city name.
+    table_merge = pd.merge(CHARLS_data_year, pollutants_data, left_on="city", right_on="ext_name", how='left')
+    # Copy the values of the previous year and of the year before that into
+    # the wave's rows. The copy is positional (.values), so it relies on the
+    # merge returning exactly one row per CHARLS row.
+    CHARLS_data.loc[wave_mask, 'last_year_nl'] = table_merge[str(year-1)].values
+    CHARLS_data.loc[wave_mask, 'before_last_nl'] = table_merge[str(year-2)].values
+    print(year)
+# Persist the updated CHARLS table.
+CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)

+ 50 - 47
CHARLS_P/CHARLS_PM.py

@@ -2,58 +2,61 @@ import pandas as pd
 from glob import glob
 import os
 
-def pollutant_handle(CHARLS_data):
+def pollutant_handle(path):
+    """Attach prior-year O3 values to every CHARLS wave and save the table.
+
+    The CHARLS table is read from ``path`` and the O3 table from
+    ``pollution/result_O3_p.csv``. For each wave year, the wave's rows are
+    left-joined to the O3 table on ``province``/``city``, and the O3 columns
+    named ``year-1`` and ``year-2`` are copied into ``last_year_O3`` and
+    ``before_last_O3``. The result is written to
+    ``CHARLS_data_pollutants.csv`` in the working directory.
+
+    The table is read once and written once, so the columns filled for one
+    wave are kept while the next wave is processed even when ``path`` is not
+    the output file.
+    """
+    years = [2011, 2013,2015, 2018, 2020]
     #读取污染物数据
-    pollutants_data = pd.read_csv("result_O3_p.csv")
-    #处理哪一年的数据
-    year = 2020
-    #开始筛选出year的数据
-    CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
-    #两个表合并
-    table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
-    #更新CHARLS表
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_O3'] = table_merge[str(year-1)].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_O3'] = table_merge[str(year-2)].values
-    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
-    print(year)
+    pollutants_data = pd.read_csv("pollution/result_O3_p.csv")
+    CHARLS_data = pd.read_csv(path)
+    print(CHARLS_data.info())
+    for year in years:
+        # Rows that belong to the survey wave being processed.
+        wave_mask = CHARLS_data['wave'] == year
+        CHARLS_data_year = CHARLS_data[wave_mask]
+        # Left-join the wave's rows to the O3 table.
+        table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
+        # Positional copy (.values): relies on the merge returning exactly
+        # one row per CHARLS row.
+        CHARLS_data.loc[wave_mask, 'last_year_O3'] = table_merge[str(year-1)].values
+        CHARLS_data.loc[wave_mask, 'before_last_O3'] = table_merge[str(year-2)].values
+        print(year)
+    # Persist the table once every wave has been filled in.
+    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
 
-def aba_handle(CHARLS_data):
-    #处理CHARLS数据的年份
-    year = 2020
-    path = "aba627/result/"
-    #读取污染物组分
-    last_year_file_name = path+str(year-1)+"_PM25_and_species_p.csv"
-    before_last_file_name = path+str(year-2)+"_PM25_and_species_p.csv"
-    last_year_pollutants_data = pd.read_csv(last_year_file_name)
-    before_last_pollutants_data = pd.read_csv(before_last_file_name)
-    #开始筛选出year的数据
-    CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
-    #和上一年的污染物组分文件合并
-    last_table_merge = pd.merge(CHARLS_data_year, last_year_pollutants_data, on=['province', 'city'], how='left')
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_SO4'] = last_table_merge["SO4"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_NO3'] = last_table_merge["NO3"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_NH4'] = last_table_merge["NH4"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_OM'] = last_table_merge["OM"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_BC'] = last_table_merge["BC"].values
-    #和上上年的污染物组分文件合并
-    before_last_table_merge = pd.merge(CHARLS_data_year, before_last_pollutants_data, on=['province', 'city'], how='left')
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_SO4'] = before_last_table_merge["SO4"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_NO3'] = before_last_table_merge["NO3"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_NH4'] = before_last_table_merge["NH4"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_OM'] = before_last_table_merge["OM"].values
-    CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_BC'] = before_last_table_merge["BC"].values
-    #更新CHARLS表
-    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
-    print(year)
+def aba_handle(path_data):
+    """Attach prior-year PM2.5 species values to every CHARLS wave and save.
+
+    The CHARLS table is read from ``path_data``. For each wave year, the
+    species files ``aba627/result/<year-1>_PM25_and_species_p.csv`` and
+    ``aba627/result/<year-2>_PM25_and_species_p.csv`` are left-joined to the
+    wave's rows on ``province``/``city``. Their ``SO4``, ``NO3``, ``NH4``,
+    ``OM`` and ``BC`` columns are copied into ``last_year_<species>`` and
+    ``before_last_<species>``. The result is written to
+    ``CHARLS_data_pollutants.csv`` in the working directory.
+
+    The table is read once and written once, so the columns filled for one
+    wave are kept while the next wave is processed even when ``path_data``
+    is not the output file.
+    """
+    years = [2011, 2013,2015, 2018, 2020]
+    species = ["SO4", "NO3", "NH4", "OM", "BC"]
+    path = "aba627/result/"
+    CHARLS_data = pd.read_csv(path_data)
+    for year in years:
+        # Read both species files up front so a missing file fails before
+        # any column of this wave is touched.
+        lagged_tables = [
+            ("last_year", pd.read_csv(path+str(year-1)+"_PM25_and_species_p.csv")),
+            ("before_last", pd.read_csv(path+str(year-2)+"_PM25_and_species_p.csv")),
+        ]
+        # Rows that belong to the survey wave being processed.
+        wave_mask = CHARLS_data['wave'] == year
+        CHARLS_data_year = CHARLS_data[wave_mask]
+        for prefix, pollutants_data in lagged_tables:
+            table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
+            for name in species:
+                # Positional copy (.values): relies on the merge returning
+                # exactly one row per CHARLS row.
+                CHARLS_data.loc[wave_mask, prefix+"_"+name] = table_merge[name].values
+        print(year)
+    # Persist the table once every wave has been filled in.
+    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
 
 if __name__ == "__main__":
     #读取CHARLS数据
-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
-    print(CHARLS_data.info())
-    # CHARLS_data1 = pd.read_csv("NHANES/result_all.csv")
-    # print(CHARLS_data1.info())
+    path = "CHARLS_data_pollutants.csv"
+    # CHARLS_data = pd.read_csv("CHARLS/result_all_new.csv")
+    # print(CHARLS_data.info())
+    # CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
     
     #处理污染物
-    # pollutant_handle(CHARLS_data)
+    # pollutant_handle(path)
     #处理PM2.5组分
-    # aba_handle(CHARLS_data)
+    aba_handle(path)

+ 676 - 0
CHARLS_P/CHARLS_preprocess_main.py

@@ -0,0 +1,676 @@
+import pandas as pd
+import numpy as np
+import pyreadstat
+
+# Unify the column names across waves
+def change_columns(df):
+    """Rename the columns of ``df`` in place, by position, to the shared names.
+
+    The 64 names are assigned purely by position, so ``df`` must already hold
+    its columns in this exact order. Nothing is returned; ``df`` itself is
+    modified.
+    """
+    # NOTE(review): a few names carry a leading space or a spelling slip
+    # (' Kidney_Diease', ' Memory_Related_Disease', ' Arthritis_or_Rheumatism',
+    # 'Disabetes_or_High_Blood_Sugar'). They are kept verbatim because other
+    # code may select columns by these exact strings.
+    identity_and_body = [
+        "ID", 'householdID', 'communityID', 'sex', "birth_year", "marital_status",
+        'province', 'city', "Height", "Weight", "Systolic", "Diastolic",
+    ]
+    blood_markers = [
+        'bl_wbc', 'bl_mcv', 'bl_plt', 'bl_bun', 'bl_glu', 'bl_crea', 'bl_cho', 'bl_tg',
+        'bl_hdl', 'bl_ldl', 'bl_crp', 'bl_hbalc', 'bl_ua', 'bl_hct', 'bl_hgb', 'bl_cysc',
+    ]
+    chronic_diseases = [
+        'Hypertension', 'Dyslipidemia', 'Disabetes_or_High_Blood_Sugar',
+        'Cancer_or_Malignant_Tumor', 'Chronic_Lung_Diseases', 'Liver_Disease',
+        'Heart_Problems', 'Stroke', ' Kidney_Diease', 'Stomach_or_Other_Digestive_Disease',
+        'Emotional_Nervous_or_Psychiatric_Problems', ' Memory_Related_Disease',
+        ' Arthritis_or_Rheumatism', 'Asthma',
+    ]
+    physical_activity = [
+        'Vigorous_Activities', 'Moderate_Physical_Effort', 'Walking',
+        'Vigorous_Activities_day', 'Moderate_Physical_Effort_day', 'Walking_day',
+        'Vigorous_Activities_2h', 'Moderate_Physical_Effort_2h', 'Walking_2h',
+        'Vigorous_Activities_30m', 'Moderate_Physical_Effort_30m', 'Walking_30m',
+        'Vigorous_Activities_4h', 'Moderate_Physical_Effort_4h', 'Walking_4h',
+    ]
+    smoking_and_drinking = ['Smoke', 'Smoke_still', 'Number_Cigarettes', 'Drink']
+    scores_and_wave = ["Cognition_score", "Psychiatric_score", "wave"]
+
+    df.columns = (identity_and_body + blood_markers + chronic_diseases
+                  + physical_activity + smoking_and_drinking + scores_and_wave)
+# The 2020 wave records Parkinson's disease and memory-related disease
+# separately; merge them so the value lines up with earlier waves.
+def process_row(row):
+    """Combine ``da003_12_`` and ``da003_13_`` of one row into a single code.
+
+    Returns 1 when either answer is 1. Returns 2 when at least one answer is
+    2 and the other is 2 or missing. Returns NaN in every other case,
+    including when both are missing. Presumably 1 means "yes" and 2 means
+    "no"; confirm against the questionnaire.
+    """
+    answers = (row['da003_12_'], row['da003_13_'])
+
+    # A single 1 decides the outcome.
+    if answers[0] == 1 or answers[1] == 1:
+        return 1
+
+    said_two = [answer == 2 for answer in answers]
+    unanswered = [pd.isna(answer) for answer in answers]
+    # 2 needs at least one explicit 2, and nothing other than 2/missing.
+    if any(said_two) and all(two or blank for two, blank in zip(said_two, unanswered)):
+        return 2
+
+    # Both missing, or any other combination of codes.
+    return np.nan
+    
+def update_da051(value):
+    """Swap the codes 1 and 3; return any other value unchanged."""
+    return 3 if value == 1 else (1 if value == 3 else value)
+    
+if __name__ == "__main__":
+    # 2011年
+    year = "2011"
+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
+    biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
+    blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
+    exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
+
+    #性别#年龄#居住地#婚姻状况
+    # 1 Married with spouse present
+    # 2 Married but not living with spouse temporarily for reasons such as work
+    # 3 Separated
+    # 4 Divorced
+    # 5 Widowed
+    # 6 Never married
+    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','be001']]
+
+    #居住地
+    data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
+
+    #身高#体重#收缩压#舒张压
+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002','qa011','qa012']]
+    data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
+
+    #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
+    blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
+    data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
+    # 慢性病:
+    # (1)  Hypertension 高血压病    
+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
+    # (除脂肪肝、肿瘤或癌外)
+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
+    # (8)	 Stroke  中风
+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
+    # (12)	 Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
+    # (14)  Asthma  哮喘
+    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
+                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
+                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
+                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
+                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
+                                   ,'da069']]
+    
+    data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
+
+    #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
+    health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
+    health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
+    health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
+    health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
+    health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
+
+    #词语记忆
+    health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
+    health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
+    #画图
+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
+
+    data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
+        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
+        health_status["draw_score"]
+    #心理得分
+    health_status["dc009_score"] = health_status["dc009"]-1
+    health_status["dc010_score"] = health_status["dc010"]-1
+    health_status["dc011_score"] = health_status["dc011"]-1
+    health_status["dc012_score"] = health_status["dc012"]-1   
+    health_status["dc013_score"] = 4 - health_status["dc013"] 
+    health_status["dc014_score"] = health_status["dc014"]-1   
+    health_status["dc015_score"] = health_status["dc015"]-1   
+    health_status["dc016_score"] = 4 - health_status["dc016"]
+    health_status["dc017_score"] = health_status["dc017"]-1   
+    health_status["dc018_score"] = health_status["dc018"]-1 
+    data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
+    data_2011["wave"] = year
+    change_columns(data_2011)
+
+    # 2013年
+    year = "2013"
+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
+    biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
+    exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
+
+    #性别#年龄#婚姻状况
+    # 1 Married with spouse present
+    # 2 Married but not living with spouse temporarily for reasons such as work
+    # 3 Separated
+    # 4 Divorced
+    # 5 Widowed
+    # 6 Never married
+    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','zba002_1','be001']]
+    #居住地
+    data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
+
+    #身高#体重#收缩压#舒张压
+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002','qa011','qa012']]
+    data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
+
+    #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
+    data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
+    
+    # 慢性病:
+    # (1)  Hypertension 高血压病    
+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
+    # (除脂肪肝、肿瘤或癌外)
+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
+    # (8)	 Stroke  中风
+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
+    # (12)	 Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
+    # (14)  Asthma  哮喘
+    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
+                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
+                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
+                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
+                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
+                                   ,'da069']]
+    
+    data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
+
+    #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
+    health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
+    health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
+    health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
+    health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
+    health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
+
+    #词语记忆
+    health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
+    health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
+    #画图
+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
+
+    data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
+        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
+        health_status["draw_score"]
+    #心理得分
+    health_status["dc009_score"] = health_status["dc009"]-1
+    health_status["dc010_score"] = health_status["dc010"]-1
+    health_status["dc011_score"] = health_status["dc011"]-1
+    health_status["dc012_score"] = health_status["dc012"]-1   
+    health_status["dc013_score"] = 4 - health_status["dc013"] 
+    health_status["dc014_score"] = health_status["dc014"]-1   
+    health_status["dc015_score"] = health_status["dc015"]-1   
+    health_status["dc016_score"] = 4 - health_status["dc016"]
+    health_status["dc017_score"] = health_status["dc017"]-1   
+    health_status["dc018_score"] = health_status["dc018"]-1 
+    data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
+    data_2013["wave"] = year
+    change_columns(data_2013)
+    data_2013 = pd.concat([data_2011, data_2013], axis=0)
+
+    # 2015年
+    year = "2015"
+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
+    blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
+    biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
+
+    #性别#年龄#婚姻状况
+    # 1 Married with spouse present
+    # 2 Married but not living with spouse temporarily for reasons such as work
+    # 3 Separated
+    # 4 Divorced
+    # 5 Widowed
+    # 6 Never married
+    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001']]
+    # 处理出生年的问题
+    data_2015['ba004_w3_1'] = demo.apply(lambda row: row['ba002_1'] if row['ba002'] == 2 else row['ba004_w3_1'], axis=1)
+
+    #居住地
+    data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
+
+    #身高#体重#收缩压#舒张压
+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'qa011','qa012']]
+    data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
+
+    #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
+    blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
+    data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
+    
+    # 慢性病:
+    # (1)  Hypertension 高血压病    
+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
+    # (除脂肪肝、肿瘤或癌外)
+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
+    # (8)	 Stroke  中风
+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
+    # (12)	 Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
+    # (14)  Asthma  哮喘
+    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
+                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
+                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
+                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
+                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
+                                   ,'da069']]
+    
+    data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
+
+    #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
+    health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
+    health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
+    health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
+    health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
+    health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
+
+    #词语记忆
+    health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
+    health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
+    #画图
+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
+
+    data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
+        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
+        health_status["draw_score"]
+    #心理得分
+    health_status["dc009_score"] = health_status["dc009"]-1
+    health_status["dc010_score"] = health_status["dc010"]-1
+    health_status["dc011_score"] = health_status["dc011"]-1
+    health_status["dc012_score"] = health_status["dc012"]-1   
+    health_status["dc013_score"] = 4 - health_status["dc013"] 
+    health_status["dc014_score"] = health_status["dc014"]-1   
+    health_status["dc015_score"] = health_status["dc015"]-1   
+    health_status["dc016_score"] = 4 - health_status["dc016"]
+    health_status["dc017_score"] = health_status["dc017"]-1   
+    health_status["dc018_score"] = health_status["dc018"]-1 
+    data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
+    data_2015["wave"] = year
+    change_columns(data_2015)
+    data_2015 = pd.concat([data_2013, data_2015], axis=0)
+
+    # 2018年
+    year = "2018"
+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
+    cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
+
+    #性别#年龄#婚姻状况
+    # 1 Married with spouse present
+    # 2 Married but not living with spouse temporarily for reasons such as work
+    # 3 Separated
+    # 4 Divorced
+    # 5 Widowed
+    # 6 Never married
+    data_2018 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001']]
+    #居住地
+    data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
+
+    #身高#体重#收缩压#舒张压
+    data_2018[['qi002', 'ql002', 'qa011','qa012']]=np.nan
+
+    #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
+    data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
+    
+    # 慢性病:
+    # (1)  Hypertension 高血压病    
+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
+    # (除脂肪肝、肿瘤或癌外)
+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
+    # (8)	 Stroke  中风
+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
+    # (12)	 Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
+    # (14)  Asthma  哮喘
+    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
+                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
+                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
+                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
+                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
+                                   ,'da069']]
+
+    data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
+
+    #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
+    cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    cognition["dc019_score"] = cognition["dc014_w4_1_1"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
+    cognition["dc020_score"] = cognition["dc014_w4_2_1"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
+    cognition["dc021_score"] = cognition["dc014_w4_3_1"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
+    cognition["dc022_score"] = cognition["dc014_w4_4_1"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
+    cognition["dc023_score"] = cognition["dc014_w4_5_1"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
+
+    #词语记忆
+    cognition["dc006s1_score"] = cognition["dc028_w4_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    cognition["dc006s2_score"] = cognition["dc028_w4_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    cognition["dc006s3_score"] = cognition["dc028_w4_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    cognition["dc006s4_score"] = cognition["dc028_w4_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    cognition["dc006s5_score"] = cognition["dc028_w4_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    cognition["dc006s6_score"] = cognition["dc028_w4_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    cognition["dc006s7_score"] = cognition["dc028_w4_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    cognition["dc006s8_score"] = cognition["dc028_w4_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    cognition["dc006s9_score"] = cognition["dc028_w4_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    cognition["dc006s10_score"] = cognition["dc028_w4_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
+    cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s1_score"] = cognition["dc047_w4_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s2_score"] = cognition["dc047_w4_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s3_score"] = cognition["dc047_w4_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s4_score"] = cognition["dc047_w4_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s5_score"] = cognition["dc047_w4_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s6_score"] = cognition["dc047_w4_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    cognition["dc027s7_score"] = cognition["dc047_w4_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s8_score"] = cognition["dc047_w4_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    cognition["dc027s9_score"] = cognition["dc047_w4_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    cognition["dc027s10_score"] = cognition["dc047_w4_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
+    cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
+    #画图
+    cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0)
+
+    data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
+        cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
+        cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
+        cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
+        cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
+        cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
+        cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
+        cognition["dc006s11_score"] + cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
+        cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
+        cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
+        cognition["dc027s9_score"]+cognition["dc027s10_score"]+cognition["dc027s11_score"]+\
+        cognition["draw_score"]
+    #心理得分
+    cognition["dc009_score"] = cognition["dc009"]-1
+    cognition["dc010_score"] = cognition["dc010"]-1
+    cognition["dc011_score"] = cognition["dc011"]-1
+    cognition["dc012_score"] = cognition["dc012"]-1   
+    cognition["dc013_score"] = 4 - cognition["dc013"] 
+    cognition["dc014_score"] = cognition["dc014"]-1   
+    cognition["dc015_score"] = cognition["dc015"]-1   
+    cognition["dc016_score"] = 4 - cognition["dc016"]
+    cognition["dc017_score"] = cognition["dc017"]-1   
+    cognition["dc018_score"] = cognition["dc018"]-1 
+    data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
+        cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
+        cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
+    data_2018["wave"] = year
+    change_columns(data_2018)
+    data_2018 = pd.concat([data_2015, data_2018], axis=0)
+
+    # 2020年
+    year = "2020"
+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
+
+    #性别#年龄#婚姻状况
+    # 1 Married with spouse present
+    # 2 Married but not living with spouse temporarily for reasons such as work
+    # 3 Separated
+    # 4 Divorced
+    # 5 Widowed
+    # 6 Never married
+    data_2020 = demo[['ID','householdID', 'communityID','ba001', 'ba003_1','ba011']]
+    #居住地
+    data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
+
+    #身高#体重#收缩压#舒张压
+    data_2020[['qi002', 'ql002', 'qa011','qa012', 'qa013']]=np.nan
+
+    #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
+    data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
+    
+    # 慢性病:
+    # (1)  Hypertension 高血压病    
+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
+    # (除脂肪肝、肿瘤或癌外)
+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
+    # (8)	 Stroke  中风
+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
+    # (12)	 Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
+    # (14)  Asthma  哮喘
+    # 2020年把帕金森和记忆病症分开,需要和以前对齐
+    health_status['da003_12_'] = health_status.apply(process_row, axis=1)
+    health_status_select = health_status[['ID','householdID', 'communityID', 'da003_1_', 'da003_2_','da003_3_'
+                                   ,'da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
+                                   ,'da003_12_','da003_14_','da003_15_','da032_1_','da032_2_', 'da032_3_'
+                                   ,'da033_1_','da033_2_','da033_3_','da034_1_','da034_2_','da034_3_','da035_1_','da035_2_','da035_3_'
+                                    ,'da036_1_','da036_2_','da036_3_', 'da046','da047','da050_1'
+                                   ,'da051']]
+    health_status_select['da051'] = health_status_select['da051'].apply(update_da051)
+    
+    data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
+
+    #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
+    health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc019_score"] = health_status["dc007_1"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
+    health_status["dc020_score"] = health_status["dc007_2"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
+    health_status["dc021_score"] = health_status["dc007_3"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
+    health_status["dc022_score"] = health_status["dc007_4"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
+    health_status["dc023_score"] = health_status["dc007_5"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
+
+    #词语记忆
+    health_status["dc006s1_score"] = health_status["dc012_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    health_status["dc006s2_score"] = health_status["dc012_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
+    health_status["dc006s3_score"] = health_status["dc012_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
+    health_status["dc006s4_score"] = health_status["dc012_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s5_score"] = health_status["dc012_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s6_score"] = health_status["dc012_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s7_score"] = health_status["dc012_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s8_score"] = health_status["dc012_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc006s9_score"] = health_status["dc012_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc006s10_score"] = health_status["dc012_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
+    health_status["dc006s11_score"] = health_status["dc012_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s1_score"] = health_status["dc028_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s2_score"] = health_status["dc028_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s3_score"] = health_status["dc028_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s4_score"] = health_status["dc028_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s5_score"] = health_status["dc028_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s6_score"] = health_status["dc028_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s7_score"] = health_status["dc028_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s8_score"] = health_status["dc028_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
+    health_status["dc027s9_score"] = health_status["dc028_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s10_score"] = health_status["dc028_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
+    health_status["dc027s11_score"] = health_status["dc028_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
+    #画图
+    health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0)
+
+    data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
+        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
+        health_status["draw_score"]
+    #心理得分
+    health_status["dc009_score"] = health_status["dc016"]-1
+    health_status["dc010_score"] = health_status["dc017"]-1
+    health_status["dc011_score"] = health_status["dc018"]-1
+    health_status["dc012_score"] = health_status["dc019"]-1   
+    health_status["dc013_score"] = 4 - health_status["dc020"] 
+    health_status["dc014_score"] = health_status["dc021"]-1   
+    health_status["dc015_score"] = health_status["dc022"]-1   
+    health_status["dc016_score"] = 4 - health_status["dc023"]
+    health_status["dc017_score"] = health_status["dc024"]-1   
+    health_status["dc018_score"] = health_status["dc025"]-1 
+    data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
+    data_2020["wave"] = year
+    change_columns(data_2020)
+    data_2020 = pd.concat([data_2018, data_2020], axis=0)
+
+    #修改地区名称
+    #省份、城市名称和污染物数据格式对齐
+    #海东地区->海东市
+    data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
+    #北京 -> 北京市
+    data_2020['city'] = data_2020['city'].replace('北京', '北京市')
+    data_2020['province'] = data_2020['province'].replace('北京', '北京市')
+    #哈尔滨 -> 哈尔滨市
+    data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
+    #天津 -> 天津市
+    data_2020['city'] = data_2020['city'].replace('天津', '天津市')
+    data_2020['province'] = data_2020['province'].replace('天津', '天津市')
+    #广西省 -> 广西壮族自治区
+    data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
+    #巢湖市 -> 合肥市
+    data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
+    #襄樊市->襄阳市
+    data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市') 
+    data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
+    print(123)

+ 184 - 3
CLHLS_P/CLHLS_process.py

@@ -52,7 +52,7 @@ def get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, colu
     # 计算总合
     result['mmse_'+cognitive_name] = result["general_cognitive_"+cognitive_name] + result["reaction_"+cognitive_name]+ result["attention_calculation_"+cognitive_name]+ result["memory_"+cognitive_name]+ result["language_selfcoordination_"+cognitive_name]
 
-if __name__ == "__main__":
+def deal_1998_2018_data():
     sav_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.sav"
     csv_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.csv"
     # 将sav数据转为csv
@@ -200,8 +200,8 @@ if __name__ == "__main__":
                     "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14",  "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
                     "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18",  "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
     trans_mmse(columns_mmse, data)
-    columns_mmse_c16 = ["c16", "c16_0", "c16_2", "c16_5", "c16_8", "c16_11", "c16_14", "c16_8"]
-    trans_mmse_c16(columns_mmse, data)
+    columns_mmse_c16 = ["c16", "c16_0", "c16_2", "c16_5", "c16_8", "c16_11", "c16_14", "c16_18"]
+    trans_mmse_c16(columns_mmse_c16, data)
     columns_cognitive_98 = ["c11", "c12", "c13", "c14", "c15", "c16"]
     columns_reaction_98 = ["c21a", "c21b", "c21c"]
     columns_attention_98 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
@@ -250,8 +250,189 @@ if __name__ == "__main__":
     columns_memory_18 = ["c41a_18",  "c41b_18", "c41c_18"]
     columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
     get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
+
+    # #血液指标
+    # columns_to_exclude = ['midn', 'trueage', 'a1']  # 替换为你要排除的列名
+    # #2008
+    # biomarker_08 = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2008-1.tab", sep='\t')
+    # biomarker_08 = biomarker_08.drop(columns=columns_to_exclude)
+    # columns_bio = ["id"]
+    # for col in biomarker_08.columns:
+    #     if not col == "id":
+    #         columns_bio.append(col+"_08")
+    # biomarker_08.columns = columns_bio
+    # result = pd.merge(result, biomarker_08, on = ["id"], how="left")
     print(result.head())
     result.to_csv("CLHLS/clhls_1998_2018_result.csv", index=False)
+
+def deal_2008_2018_data():
+    """Build the analysis table for the CLHLS 2008-2018 longitudinal cohort.
+
+    Reads the cohort CSV from ``CLHLS/``, recodes survival, demographic,
+    lifestyle and health variables into ``result``, derives the CES-D score,
+    delegates ADL and MMSE scoring to ``trans_adl``, ``trans_mmse``,
+    ``trans_mmse_c16`` and ``get_mmse``, left-joins three biomarker files on
+    ``id``, prints the first rows and writes
+    ``CLHLS/clhls_2008_2018_result.csv``.
+
+    Takes no arguments and returns ``None``; no exceptions are caught here.
+    """
+    sav_file_path = "CLHLS/clhls_2008_2018_longitudinal_dataset_released_version1.sav"
+    csv_file_path = "CLHLS/clhls_2008_2018_longitudinal_dataset_released_version1.csv"
+    # One-off SAV -> CSV conversion (the only user of sav_file_path); currently disabled.
+    # sav2csv(sav_file_path, csv_file_path)
+    # Load the cohort data.
+    data = pd.read_csv(csv_file_path)
+    # Survival status: 0 alive; 1 dead; -9 lost to follow-up; -8 dead/lost to follow-up
+    result = data[['id', 'dth08_11', 'dth11_14', 'dth14_18']].copy()  # own frame, so the assignments below do not write into a slice of data
+    # Demographic variables
+    # Codes 8/9 mean "unable to answer" and "missing"
+    # Age
+    result[['trueage_08','trueage_11', 'trueage_14', 'trueage_18']] = data[['trueage','vage_11', 'trueage_14', 'trueage_18']]  # the 2011 wave is read from vage_11
+    # Sex: 1 male; 0 female (every value other than 1, including missing, becomes 0)
+    result['sex'] = data['a1'].apply(lambda x : 1 if x==1 else 0)
+    # Ethnicity: 1 Han; 0 non-Han. NOTE(review): missing values also fall into 0 - confirm this is intended
+    result['ethnic'] = data['a2'].apply(lambda x : 1 if x==1 else 0)
+    # Birthplace: 1 urban; 0 rural
+    result['birth_place'] = data['a43'].apply(lambda x : 1 if x == 1 else (0 if x == 2 else np.nan))
+    # Education; no column is derived for the 2011 wave
+    result['edu_08'] = data['f1'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    result['edu_14'] = data['f1_14'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    result['edu_18'] = data['f1_18'].apply(lambda x : np.nan if x==88 or x==99 else x)
+    # Marital status: 0 separated/divorced/widowed/never married; 1 currently married and living with spouse. Only code 1 maps to 1, so missing/unknown codes become NaN instead of "married"
+    result['marital_08'] = data['f41'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (1 if x==1 else np.nan))
+    result['marital_11'] = data['f41_11'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (1 if x==1 else np.nan))
+    result['marital_14'] = data['f41_14'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (1 if x==1 else np.nan))
+    result['marital_18'] = data['f41_18'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (1 if x==1 else np.nan))
+    # Economic status: 1 well-off or better; 0 average or below. NOTE(review): 2008 maps 1->1 and 2/3->0 while later waves map 1/2->1 and 3/4/5->0 - confirm against the codebook
+    result['econ_state_08'] = data['f34'].apply(lambda x : 0 if x==2 or x==3 else (1 if x==1 else np.nan))
+    result['econ_state_11'] = data['f34_11'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_14'] = data['f34_14'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    result['econ_state_18'] = data['f34_18'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
+    # Household income in the previous year; 99998 means above 100,000. NOTE(review): these lambdas keep ONLY 99998 and turn every other value into NaN - this looks inverted, confirm the intended recoding
+    result['income_08'] = data['f35'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_11'] = data['f35_11'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_14'] = data['f35_14'].apply(lambda x : x if x== 99998 else np.nan)
+    result['income_18'] = data['f35_18'].apply(lambda x : x if x== 99998 else np.nan)
+    # Living arrangement: 1 with household members; 2 alone; 3 in an institution
+    result['co_residence_08'] = data['a51'].apply(lambda x : np.nan if x==9 else x)
+    result['co_residence_11'] = data['a51_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['co_residence_14'] = data['a51_14'].apply(lambda x : np.nan if x==9 else x)
+    result['co_residence_18'] = data['a51_18'].apply(lambda x : np.nan if x==9 else x)
+    # Current smoker: 1 yes; 2 no
+    result['smoke_08'] = data['d71'].apply(lambda x : np.nan if x==9 else x)
+    result['smoke_11'] = data['d71_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['smoke_14'] = data['d71_14'].apply(lambda x : np.nan if x==9 else x)
+    result['smoke_18'] = data['d71_18'].apply(lambda x : np.nan if x==9 else x)
+    # Current drinker: 1 yes; 2 no
+    result['drink_08'] = data['d81'].apply(lambda x : np.nan if x==9 else x)
+    result['drink_11'] = data['d81_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
+    result['drink_14'] = data['d81_14'].apply(lambda x : np.nan if x==9 else x)
+    result['drink_18'] = data['d81_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    # Currently exercises
+    result['exercise_08'] = data['d91'].apply(lambda x : np.nan if x==9 else x)
+    result['exercise_11'] = data['d91_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
+    result['exercise_14'] = data['d91_14'].apply(lambda x : np.nan if x==9 else x)
+    result['exercise_18'] = data['d91_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    # Self-reported health: 1 very good; 2 good; 3 so so; 4 bad; 5 very bad (the "helth" spelling in the column names is kept so existing consumers of the CSV keep working)
+    result['self_reported_helth_08'] = data['b12'].apply(lambda x : np.nan if x==9 or x==8 else x)
+    result['self_reported_helth_11'] = data['b12_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
+    result['self_reported_helth_14'] = data['b12_14'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    result['self_reported_helth_18'] = data['b12_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
+    # Chronic disease. NOTE(review): 2008 filters codes 89 and -6 where the later waves filter 88 - possible typo, confirm against the codebook
+    result['chronic_08'] = data['g14a1'].apply(lambda x : np.nan if x==66 or x==89 or x==99 or x==-9 or x == -6 or x == -1 else x)
+    result['chronic_11'] = data['g14a1_11'].apply(lambda x : np.nan if x==66 or x==88 or x==99 or x==-9 or x == -8 or x == -7 or x==-1 else x)
+    result['chronic_14'] = data['g14a1_14'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
+    result['chronic_18'] = data['g14a1_18'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
+
+    # Depression score - 2018 wave only; cesd_d: 0 no depression; 1 depression
+    # Short-form CES-D (CESD-10): ten items, each scored 0-3, summed into a total score
+    # Scoring map for the first item group
+    transformation_one = {1: 3, 2: 2, 3: 2, 4: 1, 5: 0}
+    # Apply the map; codes that are not keys of the map (e.g. 8) become NaN
+    columns_cesd_one = ['b31_18', 'b32_18', 'b33_18', 'b34_18', 'b36_18', 'b38_18', 'b39_18']
+    for column_one in columns_cesd_one:
+        data[column_one] = data[column_one].map(transformation_one)  # unmapped codes -> NaN
+    # Mirrored scoring map for the second item group
+    transformation_two = {1: 0, 2: 1, 3: 1, 4: 2, 5: 3}
+    # Apply the map
+    columns_cesd_two = ['b35_18', 'b37_18', 'b310a_18']
+    for column_two in columns_cesd_two:
+        data[column_two] = data[column_two].map(transformation_two)  # unmapped codes -> NaN
+    result['cesd'] = data['b31_18'] + data['b32_18'] + data['b33_18'] + data['b34_18'] + data['b36_18'] + data['b38_18'] + data['b39_18'] + data['b35_18'] + data['b37_18'] + data['b310a_18'] 
+    result['cesd_d'] = result['cesd'].apply(lambda x : 0 if x >= 0 and x <= 15 else (1 if x >=16 and x <= 30 else np.nan))
+
+    # Activities of daily living (ADL): 0 no disability; 1 disability
+    # ADL has six items (bathing, dressing, eating, indoor transferring, toileting, continence), each scored 0-2 and summed into a total score
+    columns_adl_08 = ['e1', 'e2', 'e3', 'e4', 'e5', 'e6']
+    trans_adl(columns_adl_08, data, result, "adl_08")
+    columns_adl_11 = ['e1_11', 'e2_11', 'e3_11', 'e4_11', 'e5_11', 'e6_11']
+    trans_adl(columns_adl_11, data, result, "adl_11")
+    columns_adl_14 = ['e1_14', 'e2_14', 'e3_14', 'e4_14', 'e5_14', 'e6_14']
+    trans_adl(columns_adl_14, data, result, "adl_14")
+    columns_adl_18 = ['e1_18', 'e2_18', 'e3_18', 'e4_18', 'e5_18', 'e6_18']
+    trans_adl(columns_adl_18, data, result, "adl_18")
+    # Cognitive function: 0 cognitive impairment; 1 normal cognition
+    # Mini-Mental State Examination (MMSE): general ability (12 points), reaction (3), attention and calculation (6), recall (3), language comprehension
+    # and self-coordination (6); 5 sections, 24 questions, 30 points in total; a higher score means better cognitive function
+    columns_mmse = ["c11", "c12", "c13", "c14", "c15", "c21a", "c21b", "c21c", "c31a", "c31b", "c31c", "c31d", "c31e", "c32", "c41a",  "c41b", "c41c", "c51a", "c51b", "c52", "c53a", "c53b", "c53c",
+                    "c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c21a_11", "c21b_11", "c21c_11", "c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11", "c41a_11",  "c41b_11", "c41c_11", "c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11",
+                    "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14",  "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
+                    "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18",  "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
+    trans_mmse(columns_mmse, data)
+    columns_mmse_c16 = ["c16", "c16_11", "c16_14", "c16_18"]
+    trans_mmse_c16(columns_mmse_c16, data)
+    columns_cognitive_08 = ["c11", "c12", "c13", "c14", "c15", "c16"]
+    columns_reaction_08 = ["c21a", "c21b", "c21c"]
+    columns_attention_08 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
+    columns_memory_08 = ["c41a",  "c41b", "c41c"]
+    columns_language_08 = ["c51a", "c51b", "c52", "c53a", "c53b", "c53c"]
+    get_mmse(columns_cognitive_08,columns_reaction_08,columns_attention_08, columns_memory_08 ,columns_language_08, data, result, "08")
+    columns_cognitive_11 = ["c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c16_11"]
+    columns_reaction_11 = ["c21a_11", "c21b_11", "c21c_11"]
+    columns_attention_11 = ["c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11"]
+    columns_memory_11 = ["c41a_11",  "c41b_11", "c41c_11"]
+    columns_language_11 = ["c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11"]
+    get_mmse(columns_cognitive_11,columns_reaction_11,columns_attention_11, columns_memory_11 ,columns_language_11, data, result, "11")
+    columns_cognitive_14 = ["c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c16_14"]
+    columns_reaction_14 = ["c21a_14", "c21b_14", "c21c_14"]
+    columns_attention_14 = ["c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14"]
+    columns_memory_14 = ["c41a_14",  "c41b_14", "c41c_14"]
+    columns_language_14 = ["c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14"]
+    get_mmse(columns_cognitive_14,columns_reaction_14,columns_attention_14, columns_memory_14 ,columns_language_14, data, result, "14")
+    columns_cognitive_18 = ["c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c16_18"]
+    columns_reaction_18 = ["c21a_18", "c21b_18", "c21c_18"]
+    columns_attention_18 = ["c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18"]
+    columns_memory_18 = ["c41a_18",  "c41b_18", "c41c_18"]
+    columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
+    get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
+
+    # Blood biomarkers: keep the plt, lymph and hdl/hdlc columns from each
+    # biomarker file and left-join them onto ``result`` by ``id``.
+    # Each entry is (tab file, name of the HDL column in that file, wave suffix).
+    # The HDL column is "hdl" in the 2008 file but "hdlc" in the 2012/2014 files,
+    # so the output columns are hdl_08, hdlc_12 and hdlc_14.
+    biomarker_sources = [
+        ("CLHLS/biomarker_dataset_CLHLS_2008.tab", "hdl", "_08"),
+        ("CLHLS/biomarker_dataset_CLHLS_2012.tab", "hdlc", "_12"),
+        ("CLHLS/biomarker_dataset_CLHLS_2014.tab", "hdlc", "_14"),
+    ]
+    for tab_path, hdl_col, suffix in biomarker_sources:
+        biomarker = pd.read_csv(tab_path, sep='\t')
+        biomarker = biomarker.loc[:, ["id", "plt", "lymph", hdl_col]]
+        # Suffix every measurement column with its wave; "id" stays the join key.
+        biomarker.columns = ["id"] + [
+            col + suffix for col in biomarker.columns if col != "id"
+        ]
+        # NOTE(review): assumes one row per id in each file; duplicate ids would multiply rows.
+        result = pd.merge(result, biomarker, on=["id"], how="left")
+
+    print(result.head())
+    result.to_csv("CLHLS/clhls_2008_2018_result.csv", index=False)
+
+if __name__ == "__main__":
+    # deal_1998_2018_data()  (the 1998-2018 cohort run is commented out)
+    deal_2008_2018_data()  # processes the 2008-2018 cohort and writes CLHLS/clhls_2008_2018_result.csv
     print(123)