2 months ago · ac61a6b014
--- a/CHARLS_P/CHARLS_PM.py
+++ b/CHARLS_P/CHARLS_PM.py
@@ -5,7 +5,7 @@ import os
 
															 def pollutant_handle(path):
														
 
															     years = [2011, 2013,2015, 2018, 2020]
														
 
															     #读取污染物数据
														
 
															-    pollutants_data = pd.read_csv("pollution/result_O3_p.csv")
														
 
															+    pollutants_data = pd.read_csv("pollution/result_pm10_1km_p.csv")
														
 
															     for year in years:
														
 
															         CHARLS_data = pd.read_csv(path)
														
 
															         print(CHARLS_data.info())
														
@@ -14,8 +14,8 @@ def pollutant_handle(path):
 
															         #两个表合并
														
 
															         table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
														
 
															         #更新CHARLS表
														
 
															-        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_O3'] = table_merge[str(year-1)].values
														
 
															-        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_O3'] = table_merge[str(year-2)].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_pm10'] = table_merge[str(year-1)].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_pm10'] = table_merge[str(year-2)].values
														
 
															         CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
														
 
															         print(year)
														
--- a/CHARLS_P/CHARLS_meteorology.py
+++ b/CHARLS_P/CHARLS_meteorology.py
@@ -0,0 +1,125 @@
 
															+import pandas as pd
														
 
															+
														
 
															+def sunlight(CHARLS_data):
														
 
															+    years = [2011, 2013,2015, 2018, 2020]
														
 
															+    for year in years:
														
 
															+        #读取日照数据
														
 
															+        sunlight_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年日照/【{year-1}年】逐年日照.xlsx")
														
 
															+        sunlight_data_last = sunlight_data_last[["城市", "累积日照"]]
														
 
															+        sunlight_data_last = sunlight_data_last.rename(columns={"累积日照":"last_sunlight"})
														
 
															+        sunlight_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年日照/【{year-2}年】逐年日照.xlsx")
														
 
															+        sunlight_data_before_last = sunlight_data_before_last[["城市", "累积日照"]]
														
 
															+        sunlight_data_before_last = sunlight_data_before_last.rename(columns={"累积日照":"before_sunlight"})
														
 
															+        #开始筛选出year的数据
														
 
															+        CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
														
 
															+        #合并日照
														
 
															+        table_merge = pd.merge(CHARLS_data_year, sunlight_data_last, left_on="city", right_on="城市", how='left')
														
 
															+        table_merge = pd.merge(table_merge, sunlight_data_before_last, left_on="city", right_on="城市", how='left')
														
 
															+        #更新CHARLS表
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_sunlight'] = table_merge['last_sunlight'].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_sunlight'] = table_merge['before_sunlight'].values
														
 
															+        CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
														
 
															+        print(year)
														
 
															+
														
 
															+def wind(CHARLS_data):
														
 
															+    years = [2011, 2013,2015, 2018, 2020]
														
 
															+    for year in years:
														
 
															+        # Read the wind-speed data (previous year and the year before that)
														
 
															+        wind_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年风速/【{year-1}年】逐年风速.xlsx")
														
 
															+        wind_data_last = wind_data_last[["城市", "平均风速"]]
														
 
															+        wind_data_last = wind_data_last.rename(columns={"平均风速":"last_wind"})
														
 
															+        wind_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年风速/【{year-2}年】逐年风速.xlsx")
														
 
															+        wind_data_before_last = wind_data_before_last[["城市", "平均风速"]]
														
 
															+        wind_data_before_last = wind_data_before_last.rename(columns={"平均风速":"before_wind"})
														
 
															+        #开始筛选出year的数据
														
 
															+        CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
														
 
															+        # Merge the wind-speed columns onto this wave's rows by city
														
 
															+        table_merge = pd.merge(CHARLS_data_year, wind_data_last, left_on="city", right_on="城市", how='left')
														
 
															+        table_merge = pd.merge(table_merge, wind_data_before_last, left_on="city", right_on="城市", how='left')
														
 
															+        #更新CHARLS表
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_wind'] = table_merge['last_wind'].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_wind'] = table_merge['before_wind'].values
														
 
															+        CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
														
 
															+        print(year)
														
 
															+
														
 
															+def rain(CHARLS_data):
														
 
															+    years = [2011, 2013,2015, 2018, 2020]
														
 
															+    for year in years:
														
 
															+        # Read the precipitation data (previous year and the year before that)
														
 
															+        rain_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年降水/【{year-1}年】逐年降水.xlsx")
														
 
															+        rain_data_last = rain_data_last[["城市", "累积降水"]]
														
 
															+        rain_data_last = rain_data_last.rename(columns={"累积降水":"last_rain"})
														
 
															+        rain_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年降水/【{year-2}年】逐年降水.xlsx")
														
 
															+        rain_data_before_last = rain_data_before_last[["城市", "累积降水"]]
														
 
															+        rain_data_before_last = rain_data_before_last.rename(columns={"累积降水":"before_rain"})
														
 
															+        #开始筛选出year的数据
														
 
															+        CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
														
 
															+        # Merge the precipitation columns onto this wave's rows by city
														
 
															+        table_merge = pd.merge(CHARLS_data_year, rain_data_last, left_on="city", right_on="城市", how='left')
														
 
															+        table_merge = pd.merge(table_merge, rain_data_before_last, left_on="city", right_on="城市", how='left')
														
 
															+        #更新CHARLS表
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_rain'] = table_merge['last_rain'].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_rain'] = table_merge['before_rain'].values
														
 
															+        CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
														
 
															+        print(year)
														
 
															+
														
 
															+def temperature(CHARLS_data):
														
 
															+    years = [2011, 2013,2015, 2018, 2020]
														
 
															+    for year in years:
														
 
															+        # Read the air-temperature data (previous year and the year before that)
														
 
															+        temperature_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年气温/【{year-1}年】逐年气温.xlsx")
														
 
															+        temperature_data_last = temperature_data_last[["城市", "平均气温"]]
														
 
															+        temperature_data_last = temperature_data_last.rename(columns={"平均气温":"last_temperature"})
														
 
															+        temperature_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年气温/【{year-2}年】逐年气温.xlsx")
														
 
															+        temperature_data_before_last = temperature_data_before_last[["城市", "平均气温"]]
														
 
															+        temperature_data_before_last = temperature_data_before_last.rename(columns={"平均气温":"before_temperature"})
														
 
															+        #开始筛选出year的数据
														
 
															+        CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
														
 
															+        # Merge the air-temperature columns onto this wave's rows by city
														
 
															+        table_merge = pd.merge(CHARLS_data_year, temperature_data_last, left_on="city", right_on="城市", how='left')
														
 
															+        table_merge = pd.merge(table_merge, temperature_data_before_last, left_on="city", right_on="城市", how='left')
														
 
															+        #更新CHARLS表
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_temperature'] = table_merge['last_temperature'].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_temperature'] = table_merge['before_temperature'].values
														
 
															+        CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
														
 
															+        print(year)
														
 
															+
														
 
															+def humidity(CHARLS_data):
														
 
															+    years = [2011, 2013,2015, 2018, 2020]
														
 
															+    for year in years:
														
 
															+        # Read the humidity data (previous year and the year before that)
														
 
															+        humidity_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年湿度/【{year-1}年】逐年湿度.xlsx")
														
 
															+        humidity_data_last = humidity_data_last[["城市", "平均湿度"]]
														
 
															+        humidity_data_last = humidity_data_last.rename(columns={"平均湿度":"last_humidity"})
														
 
															+        humidity_data_before_last = pd.read_excel(f"气象数据/2000-2020年按年逐年湿度/【{year-2}年】逐年湿度.xlsx")
														
 
															+        humidity_data_before_last = humidity_data_before_last[["城市", "平均湿度"]]
														
 
															+        humidity_data_before_last = humidity_data_before_last.rename(columns={"平均湿度":"before_humidity"})
														
 
															+        #开始筛选出year的数据
														
 
															+        CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
														
 
															+        # Merge the humidity columns onto this wave's rows by city
														
 
															+        table_merge = pd.merge(CHARLS_data_year, humidity_data_last, left_on="city", right_on="城市", how='left')
														
 
															+        table_merge = pd.merge(table_merge, humidity_data_before_last, left_on="city", right_on="城市", how='left')
														
 
															+        #更新CHARLS表
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_humidity'] = table_merge['last_humidity'].values
														
 
															+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_humidity'] = table_merge['before_humidity'].values
														
 
															+        CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
														
 
															+        print(year)
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    #读取CHARLS数据
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
														
 
															+    CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
														
 
															+    sunlight(CHARLS_data)
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
														
 
															+    wind(CHARLS_data)
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
														
 
															+    rain(CHARLS_data)
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
														
 
															+    temperature(CHARLS_data)
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
														
 
															+    humidity(CHARLS_data)
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
--- a/CHARLS_P/CHARLS_preprocess.py
+++ b/CHARLS_P/CHARLS_preprocess.py
@@ -1,11 +0,0 @@
 
															-import pandas as pd
														
 
															-
														
 
															-
														
 
															-
														
 
															-if __name__ == "__main__":
														
 
															-    path = "CHARLS_data_pollutants.csv"
														
 
															-    data = pd.read_csv(path, encoding="utf-8")
														
 
															-    print(data.info())
														
 
															-    data["born_year"] = data.groupby("ID")["born_year"].transform(lambda x : x.fillna(x.mean()))
														
 
															-    data["age"] = data["wave"] - data["born_year"]
														
 
															-    data.to_csv("CHARLS_data_pollutants_born.csv", encoding="utf-8")
														
--- a/CHARLS_P/CHARLS_preprocess.r
+++ b/CHARLS_P/CHARLS_preprocess.r
@@ -279,7 +279,7 @@ data <- merge(data, health_status_select, by = c('ID', 'householdID', 'community
 
															 if(year =="2018"){
														
 
															     health_status = Cognition
														
 
															 }
														
 
															-#计算认知功能得分，分成三部分：电话问卷10分，词语回忆10分、画图1分
														
 
															+#计算认知功能得分，分成三部分：电话问卷10分，词语回忆20分、画图1分
														
 
															 if(year == "2020"){
														
 
															     health_status$dc001s1_score <- ifelse(is.na(health_status$dc001), 0, ifelse(health_status$dc001 == 1, 1, 0))
														
 
															     health_status$dc001s2_score <- ifelse(is.na(health_status$dc005), 0, ifelse(health_status$dc005 == 2, 1, 0))
														
--- a/CHARLS_P/CHARLS_preprocess_main.py
+++ b/CHARLS_P/CHARLS_preprocess_main.py
@@ -4,23 +4,21 @@ import pyreadstat
 
															 #统一列名
														
 
															 def change_columns(df):
														
 
															-    df.columns = ["ID",'householdID','communityID','sex', "birth_year", "marital_status" , 'province', 'city',"Height", "Weight",
														
 
															-                  "Systolic","Diastolic",
														
 
															+    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "marital_status" , "education", 'province', 'city',"Height", "Weight",
														
 
															+                  "waist", "Systolic","Diastolic",
														
 
															                   'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp', 
														
 
															                   'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
														
 
															                   'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															-                  'Liver_Disease', 'Heart_Problems', 'Stroke', ' Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															-                  'Emotional_Nervous_or_Psychiatric_Problems', ' Memory_Related_Disease',' Arthritis_or_Rheumatism','Asthma',
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
														
 
															-                  'Vigorous_Activities', 'Moderate_Physical_Effort','Walking','Vigorous_Activities_day', 'Moderate_Physical_Effort_day',
														
 
															-                  'Walking_day','Vigorous_Activities_2h', 'Moderate_Physical_Effort_2h','Walking_2h','Vigorous_Activities_30m', 
														
 
															-                  'Moderate_Physical_Effort_30m','Walking_30m','Vigorous_Activities_4h', 'Moderate_Physical_Effort_4h','Walking_4h',
														
 
															+                  'Physical_activity',
														
 
															-                  'Smoke', 'Smoke_still','Number_Cigarettes','Drink',
														
 
															+                  'Smoke','Drink',
														
 
															-                  "Cognition_score", "Psychiatric_score", "wave"
														
 
															+                  "Cognition_score", "Psychiatric_score","sleep_state", "wave"
														
 
															                   ]
														
 
															 # 2020年把帕金森和记忆病症分开，需要和以前对齐   
														
 
															 def process_row(row):
														
@@ -58,19 +56,34 @@ if __name__ == "__main__":
 
															     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
														
 
															     #性别#年龄#居住地#婚姻状况
														
 
															-    # 1 Married with spouse present
														
 
															-    # 2 Married but not living with spouse temporarily for reasons such as work
														
 
															-    # 3 Separated
														
 
															-    # 4 Divorced
														
 
															-    # 5 Widowed
														
 
															-    # 6 Never married
														
 
															-    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','be001']]
														
 
															+    # 1 married or partnered
														
 
															+    # 0 other marital status (separated, divorced, unmarried, or widowed)
														
 
															+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
														
 
															+    
														
 
															+    #教育
														
 
															+    # 0 below high school
														
 
															+    # 1 high school
														
 
															+    # 2 college or above
														
 
															+    demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+
														
 
															+    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
 
															     #身高#体重#收缩压#舒张压
														
 
															-    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002','qa011','qa012']]
														
 
															+    biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) 
														
 
															+    biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    #腰围
														
 
															+    biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) 
														
 
															+    #血压测量后两次的平均
														
 
															+    biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) 
														
 
															+    biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) 
														
 
															+    biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
														
 
															+    biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
														
 
															+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic']]
														
 
															     data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
														
 
															     #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
														
@@ -93,16 +106,35 @@ if __name__ == "__main__":
 
															     # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
														
 
															     # (13)	 Arthritis or rheumatism 关节炎或风湿病
														
 
															     # (14)  Asthma  哮喘
														
 
															+
														
 
															+    # 体力活动
														
 
															+    # 2 vigorous (vigorous activity more than once a week)
														
 
															+    # 1 moderate (moderate activity more than once a week)
														
 
															+    # 0 inactive (the rest)
														
 
															+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
														
 
															+                                                             1 if x["da051_2_"]==1 else 
														
 
															+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
														
 
															+                                                             else np.nan ,axis=1)
														
 
															+    # 抽烟
														
 
															+    # 1 抽过烟
														
 
															+    # 0 没有抽过烟
														
 
															+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
														
 
															+
														
 
															+    # 喝酒
														
 
															+    # 1 喝过酒
														
 
															+    # 0 没有喝过酒
														
 
															+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
														
 
															+                                                 0 if x["da069"] == 1 else 
														
 
															+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
														
 
															+
														
 
															     health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
														
 
															                                    ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															-                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
														
 
															-                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
														
 
															-                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
														
 
															-                                   ,'da069']]
														
 
															+                                   ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"]]
														
 
															     data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
														
 
															-    #计算认知功能得分，分成三部分：电话问卷10分，词语回忆10分、画图1分
														
 
															+    
														
 
															+    #计算认知功能得分，分成三部分：电话问卷10分，词语回忆20分、画图1分
														
 
															     health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															     health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
														
 
															     health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
														
@@ -125,7 +157,7 @@ if __name__ == "__main__":
 
															     health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															     health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
														
 
															-    health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															+    # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
														
@@ -136,9 +168,9 @@ if __name__ == "__main__":
 
															     health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															     health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															+    # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															     #画图
														
 
															-    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
														
 
															+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															     data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
														
 
															         health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
														
@@ -147,10 +179,10 @@ if __name__ == "__main__":
 
															         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
														
 
															         health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
														
 
															         health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
														
 
															-        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															         health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
														
 
															         health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
														
 
															-        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
														
 
															+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
														
 
															         health_status["draw_score"]
														
 
															     #心理得分
														
 
															     health_status["dc009_score"] = health_status["dc009"]-1
														
@@ -166,8 +198,17 @@ if __name__ == "__main__":
 
															     data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
														
 
															         health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
														
 
															         health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
														
 
															+    #睡眠状态
														
 
															+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
														
 
															+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
														
 
															+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															+    data_2011["sleep_state"] = health_status['dc015']
														
 
															+
														
 
															     data_2011["wave"] = year
														
 
															     change_columns(data_2011)
														
 
															+    # 2011年的ID和其他年份有一点区别，倒数第三位加0
														
 
															+    data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
														
 
															     # 2013年
														
 
															     year = "2013"
														
@@ -179,18 +220,70 @@ if __name__ == "__main__":
 
															     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															-    # 1 Married with spouse present
														
 
															-    # 2 Married but not living with spouse temporarily for reasons such as work
														
 
															-    # 3 Separated
														
 
															-    # 4 Divorced
														
 
															-    # 5 Widowed
														
 
															-    # 6 Never married
														
 
															-    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','zba002_1','be001']]
														
 
															+    # 1 married or partnered
														
 
															+    # 0 other marital status (separated, divorced, unmarried, or widowed)
														
 
															+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
														
 
															+
														
 
															+    #教育
														
 
															+    # 0 below high school
														
 
															+    # 1 high school
														
 
															+    # 2 college or above
														
 
															+
														
 
															+    # 纠正2011年统计错误的教育
														
 
															+    demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
														
 
															+    demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+    education_correct = demo[['ID',"education_correct"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
														
 
															+    # 删除多余的列
														
 
															+    data_2011 = data_2011.drop(columns=['education_correct'])
														
 
															+
														
 
															+    #更新2013的教育
														
 
															+    demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
														
 
															+    demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+    #合并2011年的教育
														
 
															+    eductaion_2011 = data_2011[['ID',"education"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
														
 
															+
														
 
															+    # 纠正2011年统计错误的出生年
														
 
															+    demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
														
 
															+    birth_year_2013 = demo[['ID',"birth_year"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
														
 
															+    # 删除多余的列
														
 
															+    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011'])
														
 
															+    #合并2011年的出生年
														
 
															+    birth_year_2011 = data_2011[['ID',"birth_year"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
														
 
															+
														
 
															+    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','marital_status', "education"]]
														
 
															+
														
 
															     #居住地
														
 
															     data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
 
															     #身高#体重#收缩压#舒张压
														
 
															-    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002','qa011','qa012']]
														
 
															+    biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) 
														
 
															+    biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    #腰围
														
 
															+    biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) 
														
 
															+    #血压测量后两次的平均
														
 
															+    biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) 
														
 
															+    biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) 
														
 
															+    biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
														
 
															+    biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
														
 
															+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic']]
														
 
															     data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
														
 
															     #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
														
@@ -213,12 +306,51 @@ if __name__ == "__main__":
 
															     # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
														
 
															     # (13)	 Arthritis or rheumatism 关节炎或风湿病
														
 
															     # (14)  Asthma  哮喘
														
 
															-    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
														
 
															-                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															-                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
														
 
															-                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
														
 
															-                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
														
 
															-                                   ,'da069']]
														
 
															+
														
 
															+
														
 
															+    # 体力活动
														
 
															+    # 2 vigorous (vigorous activity more than once a week)
														
 
															+    # 1 moderate (moderate activity more than once a week)
														
 
															+    # 0 inactive (the rest)
														
 
															+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
														
 
															+                                                             1 if x["da051_2_"]==1 else 
														
 
															+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
														
 
															+                                                             else np.nan ,axis=1)
														
 
															+    
														
 
															+    # 抽烟
														
 
															+    # 1 抽过烟
														
 
															+    # 0 没有抽过烟
														
 
															+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
														
 
															+
														
 
															+    # 喝酒
														
 
															+    # 1 喝过酒
														
 
															+    # 0 没有喝过酒
														
 
															+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
														
 
															+                                                 0 if x["da069"] == 1 else 
														
 
															+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
														
 
															+    
														
 
															+    # 合并2011年的慢性病
														
 
															+    columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															+                                   ,'da007_12_','da007_13_','da007_14_']
														
 
															+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
														
 
															+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
														
 
															+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
														
 
															+    
														
 
															+    diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
														
 
															+
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    for col in columns_to_diseases_new:
														
 
															+        health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
														
 
															+
														
 
															+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
														
 
															     data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
														
@@ -245,7 +377,7 @@ if __name__ == "__main__":
 
															     health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															     health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
														
 
															-    health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															+    # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
														
@@ -256,9 +388,9 @@ if __name__ == "__main__":
 
															     health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															     health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															+    # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															     #画图
														
 
															-    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
														
 
															+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															     data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
														
 
															         health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
														
@@ -267,10 +399,10 @@ if __name__ == "__main__":
 
															         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
														
 
															         health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
														
 
															         health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
														
 
															-        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															         health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
														
 
															         health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
														
 
															-        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
														
 
															+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
														
 
															         health_status["draw_score"]
														
 
															     #心理得分
														
 
															     health_status["dc009_score"] = health_status["dc009"]-1
														
@@ -286,6 +418,14 @@ if __name__ == "__main__":
 
															     data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
														
 
															         health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
														
 
															         health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
														
 
															+    
														
 
															+    #睡眠状态
														
 
															+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
														
 
															+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
														
 
															+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															+    data_2013["sleep_state"] = health_status['dc015']
														
 
															+    
														
 
															     data_2013["wave"] = year
														
 
															     change_columns(data_2013)
														
 
															     data_2013 = pd.concat([data_2011, data_2013], axis=0)
														
@@ -300,21 +440,46 @@ if __name__ == "__main__":
 
															     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															-    # 1 Married with spouse present
														
 
															-    # 2 Married but not living with spouse temporarily for reasons such as work
														
 
															-    # 3 Separated
														
 
															-    # 4 Divorced
														
 
															-    # 5 Widowed
														
 
															-    # 6 Never married
														
 
															-    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001']]
														
 
															-    # 处理出生年的问题
														
 
															-    data_2015['ba004_w3_1'] = demo.apply(lambda row: row['ba002_1'] if row['ba002'] == 2 else row['ba004_w3_1'], axis=1)
														
 
															+    # 1 married or partnered
														
 
															+    # 0 other marital status (separated, divorced, unmarried, or widowed)
														
 
															+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
														
 
															+    
														
 
															+    #教育
														
 
															+    # 0 below high school
														
 
															+    # 1 high school
														
 
															+    # 2 college or above
														
 
															+    #更新2015的教育
														
 
															+    demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
														
 
															+    demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+    #合并2013年的教育
														
 
															+    eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
														
 
															+
														
 
															+    # 2015年的出生年
														
 
															+    demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
														
 
															+
														
 
															+    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year', 'marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
 
															     #身高#体重#收缩压#舒张压
														
 
															-    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'qa011','qa012']]
														
 
															+    biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) 
														
 
															+    biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    #腰围
														
 
															+    biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) 
														
 
															+    #血压测量后两次的平均
														
 
															+    biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) 
														
 
															+    biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) 
														
 
															+    biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) 
														
 
															+    biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
														
 
															+    biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
														
 
															+    #身高#体重#收缩压#舒张压
														
 
															+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic']]
														
 
															     data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
														
 
															     #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
														
@@ -338,12 +503,50 @@ if __name__ == "__main__":
 
															     # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
														
 
															     # (13)	 Arthritis or rheumatism 关节炎或风湿病
														
 
															     # (14)  Asthma  哮喘
														
 
															-    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
														
 
															-                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															-                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
														
 
															-                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
														
 
															-                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
														
 
															-                                   ,'da069']]
														
 
															+
														
 
															+    # 体力活动
														
 
															+    # 2 vigorous (vigorous activity more than once a week)
														
 
															+    # 1 moderate (moderate activity more than once a week)
														
 
															+    # 0 inactive (the rest)
														
 
															+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
														
 
															+                                                             1 if x["da051_2_"]==1 else 
														
 
															+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
														
 
															+                                                             else np.nan ,axis=1)
														
 
															+    
														
 
															+    # 抽烟
														
 
															+    # 1 抽过烟
														
 
															+    # 0 没有抽过烟
														
 
															+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
														
 
															+
														
 
															+    # 喝酒
														
 
															+    # 1 喝过酒
														
 
															+    # 0 没有喝过酒
														
 
															+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
														
 
															+                                                 0 if x["da069"] == 1 else 
														
 
															+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
														
 
															+
														
 
															+    # 合并2013年的慢性病
														
 
															+    columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															+                                   ,'da007_12_','da007_13_','da007_14_']
														
 
															+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
														
 
															+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
														
 
															+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
														
 
															+    
														
 
															+    diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
														
 
															+
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    for col in columns_to_diseases_new:
														
 
															+        health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
														
 
															+
														
 
															+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
														
 
															     data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
														
@@ -370,7 +573,7 @@ if __name__ == "__main__":
 
															     health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															     health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
														
 
															-    health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															+    # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
														
@@ -381,9 +584,9 @@ if __name__ == "__main__":
 
															     health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															     health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															     health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															+    # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															     #画图
														
 
															-    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
														
 
															+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															     data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
														
 
															         health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
														
@@ -392,10 +595,10 @@ if __name__ == "__main__":
 
															         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
														
 
															         health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
														
 
															         health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
														
 
															-        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															         health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
														
 
															         health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
														
 
															-        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
														
 
															+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
														
 
															         health_status["draw_score"]
														
 
															     #心理得分
														
 
															     health_status["dc009_score"] = health_status["dc009"]-1
														
@@ -411,6 +614,13 @@ if __name__ == "__main__":
 
															     data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
														
 
															         health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
														
 
															         health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
														
 
															+    #睡眠状态
														
 
															+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
														
 
															+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
														
 
															+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															+    data_2015["sleep_state"] = health_status['dc015']
														
 
															+    
														
 
															     data_2015["wave"] = year
														
 
															     change_columns(data_2015)
														
 
															     data_2015 = pd.concat([data_2013, data_2015], axis=0)
														
@@ -424,18 +634,28 @@ if __name__ == "__main__":
 
															     cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															-    # 1 Married with spouse present
														
 
															-    # 2 Married but not living with spouse temporarily for reasons such as work
														
 
															-    # 3 Separated
														
 
															-    # 4 Divorced
														
 
															-    # 5 Widowed
														
 
															-    # 6 Never married
														
 
															-    data_2018 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001']]
														
 
															+    # 1 married or partnered
														
 
															+    # 0 other marital status (separated, divorced, unmarried, or widowed)
														
 
															+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
														
 
															+
														
 
															+    #教育
														
 
															+    # 0 below high school
														
 
															+    # 1 high school
														
 
															+    # 2 college or above
														
 
															+    #更新2015的教育
														
 
															+    demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
														
 
															+    demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+
														
 
															+    # 出生年
														
 
															+    demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
														
 
															+
														
 
															+    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
														
 
															+
														
 
															     #居住地
														
 
															     data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
 
															     #身高#体重#收缩压#舒张压
														
 
															-    data_2018[['qi002', 'ql002', 'qa011','qa012']]=np.nan
														
 
															+    data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
														
 
															     #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
														
 
															     #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
														
@@ -457,52 +677,89 @@ if __name__ == "__main__":
 
															     # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
														
 
															     # (13)	 Arthritis or rheumatism 关节炎或风湿病
														
 
															     # (14)  Asthma  哮喘
														
 
															-    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
														
 
															-                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															-                                   ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
														
 
															-                                   ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
														
 
															-                                   ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
														
 
															-                                   ,'da069']]
														
 
															+
														
 
															+    # 体力活动
														
 
															+    # 2 vigorous (vigorous activity more than once a week)
														
 
															+    # 1 moderate (moderate activity more than once a week)
														
 
															+    # 0 inactive (the rest)
														
 
															+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
														
 
															+                                                             1 if x["da051_2_"]==1 else 
														
 
															+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
														
 
															+                                                             else np.nan ,axis=1)
														
 
															+    
														
 
															+    # 抽烟
														
 
															+    # 1 抽过烟
														
 
															+    # 0 没有抽过烟
														
 
															+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
														
 
															+
														
 
															+    # 喝酒
														
 
															+    # 1 喝过酒
														
 
															+    # 0 没有喝过酒
														
 
															+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
														
 
															+                                                 0 if x["da069"] == 1 else 
														
 
															+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
														
 
															+    
														
 
															+    columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
														
 
															+                                   ,'da007_12_','da007_13_','da007_14_']
														
 
															+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
														
 
															+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
														
 
															+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
														
 
															+    
														
 
															+    diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
														
 
															+
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    for col in columns_to_diseases_new:
														
 
															+        health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
														
 
															+
														
 
															+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
														
 
															     data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
														
 
															     #计算认知功能得分，分成三部分：电话问卷10分，词语回忆10分、画图1分
														
 
															-    cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc019_score"] = cognition["dc014_w4_1_1"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc020_score"] = cognition["dc014_w4_2_1"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc021_score"] = cognition["dc014_w4_3_1"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc022_score"] = cognition["dc014_w4_4_1"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc023_score"] = cognition["dc014_w4_5_1"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
														
 
															+    cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+    cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+    cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+    cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+    cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+    cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) 
														
 
															+    cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) 
														
 
															+    cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
														
 
															+    cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
														
 
															+    cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
														
 
															     #词语记忆
														
 
															-    cognition["dc006s1_score"] = cognition["dc028_w4_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc006s2_score"] = cognition["dc028_w4_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc006s3_score"] = cognition["dc028_w4_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
														
 
															-    cognition["dc006s4_score"] = cognition["dc028_w4_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc006s5_score"] = cognition["dc028_w4_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc006s6_score"] = cognition["dc028_w4_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
														
 
															-    cognition["dc006s7_score"] = cognition["dc028_w4_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc006s8_score"] = cognition["dc028_w4_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc006s9_score"] = cognition["dc028_w4_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															-    cognition["dc006s10_score"] = cognition["dc028_w4_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
														
 
															-    cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s1_score"] = cognition["dc047_w4_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s2_score"] = cognition["dc047_w4_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s3_score"] = cognition["dc047_w4_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s4_score"] = cognition["dc047_w4_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s5_score"] = cognition["dc047_w4_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s6_score"] = cognition["dc047_w4_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
														
 
															-    cognition["dc027s7_score"] = cognition["dc047_w4_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s8_score"] = cognition["dc047_w4_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															-    cognition["dc027s9_score"] = cognition["dc047_w4_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															-    cognition["dc027s10_score"] = cognition["dc047_w4_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
														
 
															-    cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															+    cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
														
 
															+    cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
														
 
															+    cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
														
 
															+    cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1) 
														
 
															+    cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1) 
														
 
															+    cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)                                            
														
 
															+    cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1) 
														
 
															+    cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1) 
														
 
															+    cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)                                            
														
 
															+    cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)                                           
														
 
															+    # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															+    cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1) 
														
 
															+    cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1) 
														
 
															+    cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1) 
														
 
															+    cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1) 
														
 
															+    cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1) 
														
 
															+    cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)                                            
														
 
															+    cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1) 
														
 
															+    cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1) 
														
 
															+    cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)                                            
														
 
															+    cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)                                            
														
 
															+    # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															     #画图
														
 
															-    cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0)
														
 
															+    cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															     data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
														
 
															         cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
														
@@ -511,25 +768,32 @@ if __name__ == "__main__":
 
															         cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
														
 
															         cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
														
 
															         cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
														
 
															-        cognition["dc006s11_score"] + cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
														
 
															+        cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
														
 
															         cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
														
 
															         cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
														
 
															-        cognition["dc027s9_score"]+cognition["dc027s10_score"]+cognition["dc027s11_score"]+\
														
 
															+        cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
														
 
															         cognition["draw_score"]
														
 
															     #心理得分
														
 
															-    cognition["dc009_score"] = cognition["dc009"]-1
														
 
															-    cognition["dc010_score"] = cognition["dc010"]-1
														
 
															-    cognition["dc011_score"] = cognition["dc011"]-1
														
 
															-    cognition["dc012_score"] = cognition["dc012"]-1   
														
 
															-    cognition["dc013_score"] = 4 - cognition["dc013"] 
														
 
															-    cognition["dc014_score"] = cognition["dc014"]-1   
														
 
															-    cognition["dc015_score"] = cognition["dc015"]-1   
														
 
															-    cognition["dc016_score"] = 4 - cognition["dc016"]
														
 
															-    cognition["dc017_score"] = cognition["dc017"]-1   
														
 
															-    cognition["dc018_score"] = cognition["dc018"]-1 
														
 
															+    cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) 
														
 
															+    cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) 
														
 
															     data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
														
 
															         cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
														
 
															         cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
														
 
															+    #睡眠状态
														
 
															+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
														
 
															+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
														
 
															+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															+    data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x) 
														
 
															+    
														
 
															     data_2018["wave"] = year
														
 
															     change_columns(data_2018)
														
 
															     data_2018 = pd.concat([data_2015, data_2018], axis=0)
														
@@ -541,18 +805,38 @@ if __name__ == "__main__":
 
															     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															-    # 1 Married with spouse present
														
 
															-    # 2 Married but not living with spouse temporarily for reasons such as work
														
 
															-    # 3 Separated
														
 
															-    # 4 Divorced
														
 
															-    # 5 Widowed
														
 
															-    # 6 Never married
														
 
															-    data_2020 = demo[['ID','householdID', 'communityID','ba001', 'ba003_1','ba011']]
														
 
															+    # 1 married or partnered
														
 
															+    # 0 other marital status (separated, divorced, unmarried, or widowed)
														
 
															+    demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
														
 
															+
														
 
															+    #教育
														
 
															+    # 0 below high school
														
 
															+    # 1 high school
														
 
															+    # 2 college or above
														
 
															+    demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
														
 
															+    demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+    #合并2018年的教育
														
 
															+    eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
														
 
															+
														
 
															+    # 出生年
														
 
															+    demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if not pd.isna(x["ba003_1"]) else np.nan, axis=1)  # keep the birth year reported in 2020; rows left NaN are back-filled from the 2018 wave below
														
 
															+    #合并2018年的出生年
														
 
															+    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year"]]
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
														
 
															+
														
 
															+    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
 
															     #身高#体重#收缩压#舒张压
														
 
															-    data_2020[['qi002', 'ql002', 'qa011','qa012', 'qa013']]=np.nan
														
 
															+    data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
														
 
															     #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
														
 
															     #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
														
@@ -575,81 +859,123 @@ if __name__ == "__main__":
 
															     # (13)	 Arthritis or rheumatism 关节炎或风湿病
														
 
															     # (14)  Asthma  哮喘
														
 
															     # 2020年把帕金森和记忆病症分开，需要和以前对齐
														
 
															+
														
 
															+    # 体力活动
														
 
															+    # 2 vigorous (vigorous activity more than once a week)
														
 
															+    # 1 moderate (moderate activity more than once a week)
														
 
															+    # 0 inactive (the rest)
														
 
															+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else 
														
 
															+                                                             1 if x["da032_2_"]==1 else 
														
 
															+                                                             0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2) 
														
 
															+                                                             else np.nan ,axis=1)
														
 
															+    
														
 
															+    # 抽烟
														
 
															+    # 1 抽过烟
														
 
															+    # 0 没有抽过烟
														
 
															+    health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
														
 
															+
														
 
															+    # 喝酒
														
 
															+    # 1 喝过酒
														
 
															+    # 0 没有喝过酒
														
 
															+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else 
														
 
															+                                                 0 if x["da051"] == 3 else np.nan, axis=1)
														
 
															+
														
 
															     health_status['da003_12_'] = health_status.apply(process_row, axis=1)
														
 
															-    health_status_select = health_status[['ID','householdID', 'communityID', 'da003_1_', 'da003_2_','da003_3_'
														
 
															-                                   ,'da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
														
 
															-                                   ,'da003_12_','da003_14_','da003_15_','da032_1_','da032_2_', 'da032_3_'
														
 
															-                                   ,'da033_1_','da033_2_','da033_3_','da034_1_','da034_2_','da034_3_','da035_1_','da035_2_','da035_3_'
														
 
															-                                    ,'da036_1_','da036_2_','da036_3_', 'da046','da047','da050_1'
														
 
															-                                   ,'da051']]
														
 
															-    health_status_select['da051'] = health_status_select['da051'].apply(update_da051)
														
 
															+
														
 
															+    columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
														
 
															+                                   ,'da003_12_','da003_14_','da003_15_']
														
 
															+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
														
 
															+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
														
 
															+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
														
 
															+    
														
 
															+    diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
														
 
															+
														
 
															+    # 按 'ID' 列合并两个表
														
 
															+    health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
														
 
															+    # 使用 fillna() 来更新字段
														
 
															+    for col in columns_to_diseases_new:
														
 
															+        health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
														
 
															+
														
 
															+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
														
 
															     data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
														
 
															     #计算认知功能得分，分成三部分：电话问卷10分，词语回忆（即时回忆10分+延迟回忆10分，下方代码直接相加、未取平均）、画图1分
														
 
															-    health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc019_score"] = health_status["dc007_1"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc020_score"] = health_status["dc007_2"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc021_score"] = health_status["dc007_3"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc022_score"] = health_status["dc007_4"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc023_score"] = health_status["dc007_5"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
														
 
															+    health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															+    health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															+    health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															+    health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															+    health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															+    health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1) 
														
 
															+    health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1) 
														
 
															+    health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
														
 
															+    health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
														
 
															+    health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
														
 
															     #词语记忆
														
 
															-    health_status["dc006s1_score"] = health_status["dc012_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc006s2_score"] = health_status["dc012_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc006s3_score"] = health_status["dc012_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
														
 
															-    health_status["dc006s4_score"] = health_status["dc012_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc006s5_score"] = health_status["dc012_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc006s6_score"] = health_status["dc012_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc006s7_score"] = health_status["dc012_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc006s8_score"] = health_status["dc012_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc006s9_score"] = health_status["dc012_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc006s10_score"] = health_status["dc012_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
														
 
															-    health_status["dc006s11_score"] = health_status["dc012_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s1_score"] = health_status["dc028_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s2_score"] = health_status["dc028_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s3_score"] = health_status["dc028_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s4_score"] = health_status["dc028_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s5_score"] = health_status["dc028_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s6_score"] = health_status["dc028_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc027s7_score"] = health_status["dc028_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s8_score"] = health_status["dc028_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
														
 
															-    health_status["dc027s9_score"] = health_status["dc028_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc027s10_score"] = health_status["dc028_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
														
 
															-    health_status["dc027s11_score"] = health_status["dc028_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															+    health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
														
 
															+    health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
														
 
															+    health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
														
 
															+    health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1) 
														
 
															+    health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1) 
														
 
															+    health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)                                            
														
 
															+    health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1) 
														
 
															+    health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1) 
														
 
															+    health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)                                            
														
 
															+    health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)                                           
														
 
															+    health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1) 
														
 
															+    health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1) 
														
 
															+    health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1) 
														
 
															+    health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1) 
														
 
															+    health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1) 
														
 
															+    health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)                                            
														
 
															+    health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1) 
														
 
															+    health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1) 
														
 
															+    health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)                                            
														
 
															+    health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)                                            
														
 
															     #画图
														
 
															-    health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0)
														
 
															+    health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
														
 
															-    data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
														
 
															+    data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
														
 
															         health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
														
 
															         health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
														
 
															         health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
														
 
															         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
														
 
															         health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
														
 
															         health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
														
 
															-        health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
														
 
															         health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
														
 
															         health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
														
 
															-        health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
														
 
															+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
														
 
															         health_status["draw_score"]
														
 
															     #心理得分
														
 
															-    health_status["dc009_score"] = health_status["dc016"]-1
														
 
															-    health_status["dc010_score"] = health_status["dc017"]-1
														
 
															-    health_status["dc011_score"] = health_status["dc018"]-1
														
 
															-    health_status["dc012_score"] = health_status["dc019"]-1   
														
 
															-    health_status["dc013_score"] = 4 - health_status["dc020"] 
														
 
															-    health_status["dc014_score"] = health_status["dc021"]-1   
														
 
															-    health_status["dc015_score"] = health_status["dc022"]-1   
														
 
															-    health_status["dc016_score"] = 4 - health_status["dc023"]
														
 
															-    health_status["dc017_score"] = health_status["dc024"]-1   
														
 
															-    health_status["dc018_score"] = health_status["dc025"]-1 
														
 
															+    health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) 
														
 
															+    health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
														
 
															+    health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
														
 
															+    health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) 
														
 
															     data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
														
 
															         health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
														
 
															         health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
														
 
															+    
														
 
															+    #睡眠状态
														
 
															+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
														
 
															+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
														
 
															+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															+    data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x) 
														
 
															+
														
 
															     data_2020["wave"] = year
														
 
															     change_columns(data_2020)
														
 
															     data_2020 = pd.concat([data_2018, data_2020], axis=0)
														
--- a/CHARLS_P/CHARLS_split.py
+++ b/CHARLS_P/CHARLS_split.py
@@ -0,0 +1,91 @@
 
															+import pandas as pd
														
 
															+
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    path = "CHARLS_data_pollutants_mete.csv"
														
 
															+    data = pd.read_csv(path, encoding="utf-8")
														
 
															+    print(data.info())
														
 
															+    base_feature = ['ID', 'wave','rgender','birth_year','marital_status','education','Smoke', 'Drink']
														
 
															+    pollutant_feature = ['last_year_O3', 'before_last_O3',	'last_year_pm1', 'before_last_pm1','last_year_pm2.5','before_last_pm2.5',
														
 
															+                  'last_year_pm10',	'before_last_pm10',	
														
 
															+                  'last_year_SO4', 'last_year_NO3', 'last_year_NH4', 'last_year_OM', 'last_year_BC', 
														
 
															+                  'before_last_SO4', 'before_last_NO3',	'before_last_NH4', 'before_last_OM', 'before_last_BC']
														
 
															+    nl_feature = ['last_year_nl', 'before_last_nl']
														
 
															+    meteorology_features = ['last_year_sunlight', 'before_last_sunlight', 'last_year_wind', 'before_last_wind',
														
 
															+                            'last_year_rain', 'before_last_rain', 'last_year_temperature', 'before_last_temperature', 
														
 
															+                            'last_year_humidity', 'before_last_humidity']
														
 
															+    blood_features = ['bl_wbc', 'bl_mcv', 'bl_plt', 'bl_bun', 'bl_glu', 'bl_crea', 'bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl',
														
 
															+                      'bl_crp', 'bl_hbalc', 'bl_ua', 'bl_hct', 'bl_hgb', 'bl_cysc']
														
 
															+    disease_features = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
 
															+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
														
 
															+    #夜光暴露与空气污染对老年人认知功能的交互影响及炎症和氧化应激的中介效应
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features+["Cognition_score", "Memory_Related_Disease"]]
														
 
															+    one_data.to_csv("2.csv", index=False)
														
 
															+
														
 
															+    #夜光暴露与空气污染及其交互作用与代谢综合征关联性研究
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature +["bl_glu",'bl_tg','bl_hdl', "Height", "Weight", "Systolic", "Diastolic", "Dyslipidemia", "Disabetes_or_High_Blood_Sugar"]]
														
 
															+    one_data.to_csv("3.csv", index=False)
														
 
															+
														
 
															+    # 4.夜光暴露与空气污染对心理健康（抑郁症状，生活满意度）的交互影响及炎症和氧化应激的中介效应
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features +["Psychiatric_score", "Emotional_Nervous_or_Psychiatric_Problems"]]
														
 
															+    one_data.to_csv("4.csv", index=False)
														
 
															+
														
 
															+    # 5.夜光暴露与空气污染及其交互作用对多病共存的影响及炎症和氧化应激的中介效应
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features + disease_features]
														
 
															+    one_data.to_csv("5.csv", index=False)
														
 
															+
														
 
															+    # 6.夜光暴露与空气污染对胰岛素抵抗（甘油三酯-葡萄糖指数）及炎症和氧化应激的中介效应
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features]
														
 
															+    one_data.to_csv("6.csv", index=False)    
														
 
															+
														
 
															+    # 7.基于机器学习探究夜光暴露、应激性高血糖比和空气污染对泛血管疾病/心血管病的影响与预测作用        AI探索之间的关联性，预测效果优于传统方法
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + ["bl_glu", "bl_hbalc", "Heart_Problems"]]
														
 
															+    one_data.to_csv("7.csv", index=False)   
														
 
															+
														
 
															+    # 8.夜光暴露与慢性非传染性疾病（高血压、心脏病、中风、糖尿病、关节炎、癌症和记忆相关疾病）关联的因果效应
														
 
															+    one_data = data.loc[:, base_feature + nl_feature + disease_features]
														
 
															+    one_data.to_csv("8.csv", index=False)  
														
 
															+   
														
 
															+    # 9.夜光暴露与空气污染对心脏代谢多发病，肾功能异常，血脂异常，痛风，高尿酸血症，代谢综合征等交互影响及炎症和氧化应激的中介效应
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features + disease_features]
														
 
															+    one_data.to_csv("9.csv", index=False)
														
 
															+
														
 
															+    # 10.夜光暴露与空气污染累积联合暴露与疾病之间的关联性
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
														
 
															+    one_data.to_csv("10.csv", index=False)    
														
 
															+
														
 
															+    # 11.夜光暴露与空气污染对感染性疾病的交互影响及炎症和氧化应激的中介效应
														
 
															+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features + disease_features]
														
 
															+    one_data.to_csv("11.csv", index=False)    
														
 
															+
														
 
															+    # 12.基于等时替代模型的夜光暴露与空气污染与疾病的关联性及机制研究
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
														
 
															+    one_data.to_csv("12.csv", index=False)   
														
 
															+
														
 
															+    # 13.夜光暴露与空气污染对睡眠障碍的交互影响及炎症和氧化应激的中介效应
														
 
															+    
														
 
															+
														
 
															+    # 14.基于贝叶斯网络的夜光暴露、空气污染与肥胖相关指标（BMI、锥度指数、相对脂肪质量指数等）对疾病发病风险的预测作用
														
 
															+
														
 
															+
														
 
															+    # 15.夜光与空气污染暴露变化轨迹（潜增长模型等）与疾病的关联性研究
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
														
 
															+    one_data.to_csv("15.csv", index=False)       
														
 
															+
														
 
															+    # 17.基于全环境暴露组的疾病发生风险预测模型构建及验证
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + meteorology_features + disease_features]
														
 
															+    one_data.to_csv("17.csv", index=False)   
														
 
															+
														
 
															+    # 18.基于倍差法的夜光暴露与空气污染对疾病发病风险的因果效应
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
														
 
															+    one_data.to_csv("18.csv", index=False)   
														
 
															+
														
 
															+    # 19.夜光暴露与空气污染的联合暴露与疾病发生风险的关联性
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
														
 
															+    one_data.to_csv("19.csv", index=False)   
														
 
															+
														
 
															+    # 20.基于深度神经网络的全环境暴露组与疾病的关联性及发生风险预测
														
 
															+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + meteorology_features + disease_features]
														
 
															+    one_data.to_csv("20.csv", index=False)   
														
--- a/README.md
+++ b/README.md
@@ -2,3 +2,9 @@
 
															 CHARLS_P中是对CHARLS数据进行处理的程序
														
 
															 NHANES_P中是对NHANES数据进行处理的程序
														
 
															 AreaCity-Query-Geometry中是对坐标数据进行处理的程序
														
 
															+
														
 
															+基于马尔科夫多状态模型的夜光暴露、空气污染、行为生活方式与共病的关系
														
 
															+
														
 
															+1. 什么是共病，如何定义共病，同时患有两种及以上疾病就叫共病？
														
 
															+2. 结局变量为分类变量，状态之间可以相互转化，共病的状态如何定义？
														
 
															+3. 设置状态：（1）健康状态：无任何共病症状的人群。（2）单一疾病状态：患有某一种特定疾病的人群，可以分为不同的疾病类别，如心理疾病（如焦虑、抑郁）或生理疾病（如高血压、糖尿病）。（3）多种疾病共存状态：同时患有两种或多种疾病的状态，例如焦虑和失眠同时存在，或者糖尿病和心血管疾病共存。（4）恶化状态：共病症状逐渐恶化、病情加重的状态，这一状态通常会与疾病的进展或夜光暴露程度相关。（5）缓解状态：共病症状缓解，患者部分或完全恢复健康的状态。（6）死亡状态：研究对象死亡，为不可逆的吸收状态。
														
--- a/test.py
+++ b/test.py
@@ -49,10 +49,10 @@ import pandas as pd
 
															 #         print(f"Error reading {file_path}: {e}")
														
 
															 # print(num)
														
 
															-# data = pd.read_csv("/root/r_base/CHARLS/result_all.csv")
														
 
															+# data = pd.read_csv("/root/r_base/CHARLS/result_all_new.csv")
														
 
															 # print(data.shape)
														
 
															-# # 去重并统计ID个数
														
 
															-# unique_ids = data.drop_duplicates(subset=["householdID"])
														
 
															+# # # 去重并统计ID个数
														
 
															+# unique_ids = data.drop_duplicates(subset=["ID","communityID"])
														
 
															 # count_unique_ids = unique_ids.count()
														
 
															 # print(count_unique_ids)
														
 
															 # 指定文件夹路径
														
@@ -74,4 +74,60 @@ import pandas as pd
 
															 #         num += df.shape[1]
														
 
															 #     except Exception as e:
														
 
															 #         print(f"Error reading {file_path}: {e}")
														
 
															-# print(num)
														
 
															+# print(num)
														
 
															+import pyreadstat
														
 
															+import numpy as np
														
 
															+
														
 
															+year = "2018"
														
 
															+cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
														
 
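															+# Compute the cognition score from three parts: 10 telephone-questionnaire items (dc001s1..dc023), 20 word-recall items (two groups of 10: dc006s1..s10 and dc027s1..s10), and 1 drawing item -- 31 points maximum.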
														
 
															+cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) 
														
 
															+cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) 
														
 
															+cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
														
 
															+cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
														
 
															+cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
														
 
															+
														
 
															+#词语记忆
														
 
															+cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
														
 
															+cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
														
 
															+cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
														
 
															+cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1) 
														
 
															+cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1) 
														
 
															+cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)                                            
														
 
															+cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1) 
														
 
															+cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1) 
														
 
															+cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)                                            
														
 
															+cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)                                           
														
 
															+# cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
														
 
															+cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1) 
														
 
															+cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1) 
														
 
															+cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1) 
														
 
															+cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1) 
														
 
															+cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1) 
														
 
															+cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)                                            
														
 
															+cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1) 
														
 
															+cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1) 
														
 
															+cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)                                            
														
 
															+cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)                                            
														
 
															+# cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
														
 
															+#画图
														
 
															+cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
														
 
															+
														
 
															+cognition["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
														
 
															+    cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
														
 
															+    cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
														
 
															+    cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
														
 
															+    cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
														
 
															+    cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
														
 
															+    cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
														
 
															+    cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
														
 
															+    cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
														
 
															+    cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
														
 
															+    cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
														
 
															+    cognition["draw_score"]
														
 
															+cognition.to_csv("/root/r_base/CHARLS/test.csv")