před 2 týdny · 9dd6977e4f
--- a/CHARLS_P/CHARLS_NDVI.py
+++ b/CHARLS_P/CHARLS_NDVI.py
@@ -4,9 +4,9 @@ if __name__ == "__main__":
 
				     years = [2011, 2013,2015, 2018, 2020]
			
 
				 
			
 
				     #读取CHARLS数据
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd.csv",index=False)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m.csv")
			
 
				+    CHARLS_data.to_csv("CHARLS_data_p_n_m_nd.csv",index=False)
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m_nd.csv")
			
 
				     
			
 
				     #读取NDVI数据
			
 
				     ndvi_data = pd.read_excel(f"NDVI/【立方数据学社】地级市等级的逐年NDVI.xlsx")
			
@@ -23,4 +23,4 @@ if __name__ == "__main__":
 
				         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_ndvi'] = table_merge[str(year-1)].values
			
 
				         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_ndvi'] = table_merge[str(year-2)].values
			
 
				         print(year)
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd.csv",index=False)
			
 
				+    CHARLS_data.to_csv("CHARLS_data_p_n_m_nd.csv",index=False)
			
--- a/CHARLS_P/CHARLS_NL.py
+++ b/CHARLS_P/CHARLS_NL.py
@@ -16,5 +16,5 @@ for year in years:
 
				     #更新CHARLS表
			
 
				     CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_nl'] = table_merge[str(year-1)].values
			
 
				     CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_nl'] = table_merge[str(year-2)].values
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
			
 
				+    CHARLS_data.to_csv("CHARLS_data_p_n.csv",index=False)
			
 
				     print(year)
			
--- a/CHARLS_P/CHARLS_PM.py
+++ b/CHARLS_P/CHARLS_PM.py
@@ -5,7 +5,7 @@ import os
 
				 def pollutant_handle(path):
			
 
				     years = [2011, 2013,2015, 2018, 2020]
			
 
				     #读取污染物数据
			
 
				-    pollutants_data = pd.read_csv("pollution/result_pm10_1km_p.csv")
			
 
				+    pollutants_data = pd.read_csv("pollution/result_O3_p.csv")
			
 
				     for year in years:
			
 
				         CHARLS_data = pd.read_csv(path)
			
 
				         print(CHARLS_data.info())
			
@@ -15,9 +15,9 @@ def pollutant_handle(path):
 
				         table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
			
 
				         if str(year - 1) in table_merge.columns:
			
 
				             #更新CHARLS表
			
 
				-            CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_pm10'] = table_merge[str(year-1)].values
			
 
				+            CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_O3'] = table_merge[str(year-1)].values
			
 
				         if str(year - 2) in table_merge.columns:
			
 
				-            CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_pm10'] = table_merge[str(year-2)].values
			
 
				+            CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_O3'] = table_merge[str(year-2)].values
			
 
				         CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
			
 
				         print(year)
			
 
				 
			
--- a/CHARLS_P/CHARLS_exit.py
+++ b/CHARLS_P/CHARLS_exit.py
@@ -1,24 +0,0 @@
 
				-import pandas as pd
			
 
				-import pyreadstat
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    #读取CHARLS数据
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants_exit.csv",index=False)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_exit.csv")
			
 
				-    
			
 
				-    #增加一列死亡状态
			
 
				-    #0：未死亡
			
 
				-    #1：死亡 
			
 
				-    #读取2013年的死亡数据
			
 
				-    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/Exit_Interview.dta")
			
 
				-    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
			
 
				-    exit["exit_year"] = exit["exb001_1"]
			
 
				-    CHARLS_data = pd.merge(CHARLS_data, exit[['ID', "exit_year"]], on = "ID", how="left")
			
 
				-
			
 
				-    #读取2020年的死亡数据
			
 
				-    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2020/Exit_Module.dta")
			
 
				-    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
			
 
				-    exit["exit_year"] = exit["exb001_1"]
			
 
				-    CHARLS_data = pd.merge(CHARLS_data, exit[['ID', "exit_year"]], on = "ID", how="left")
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants_exit.csv",index=False)
			
--- a/CHARLS_P/CHARLS_harmonized.py
+++ b/CHARLS_P/CHARLS_harmonized.py
@@ -84,7 +84,7 @@ def calculate_age(row):
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     harmonized, meta = pyreadstat.read_dta("/root/r_base/CHARLS/Harmonized_CHARLS/H_CHARLS_D_Data.dta")
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m_nd.csv")
			
 
				     harmonized['ID'] = harmonized['ID'].astype(str)  # 转换为字符串
			
 
				     CHARLS_data['ID'] = CHARLS_data['ID'].astype(str)  # 转换为字符串
			
 
				     #婚姻状况
			
@@ -201,7 +201,7 @@ if __name__ == "__main__":
 
				     # 0 below high school
			
 
				     # 1 high school
			
 
				     # 2 college or above
			
 
				-    harmonized["raeduc_c"] = harmonized["raeduc_c"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10] else 0 if x in [1,2,3,4,5] else np.nan)
			
 
				+    harmonized["raeduc_c"] = harmonized["raeduc_c"].apply(lambda x : 1 if x in [3,4,5,6,7,8, 9, 10] else 0 if x in [1,2] else np.nan)
			
 
				     CHARLS_data = pd.merge(CHARLS_data, harmonized[["ID", "ragender", "rabyear", "raeduc_c"]], on='ID', how='left')
			
 
				 
			
 
				     #合并
			
@@ -252,5 +252,4 @@ if __name__ == "__main__":
 
				     #重新计算年龄
			
 
				     CHARLS_data["age"] = CHARLS_data.apply(calculate_age, axis=1)
			
 
				 
			
 
				-    
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv", index=False)
			
 
				+    CHARLS_data.to_csv("CHARLS_data_p_n_m_nd_h.csv", index=False)
			
--- a/CHARLS_P/CHARLS_meteorology.py
+++ b/CHARLS_P/CHARLS_meteorology.py
@@ -107,17 +107,17 @@ def humidity(CHARLS_data):
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     #读取CHARLS数据
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants.csv")
			
 
				-    CHARLS_data.to_csv("CHARLS_data_pollutants_mete.csv",index=False)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n.csv")
			
 
				+    CHARLS_data.to_csv("CHARLS_data_p_n_m.csv",index=False)
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m.csv")
			
 
				     sunlight(CHARLS_data)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m.csv")
			
 
				     wind(CHARLS_data)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m.csv")
			
 
				     rain(CHARLS_data)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m.csv")
			
 
				     temperature(CHARLS_data)
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m.csv")
			
 
				     humidity(CHARLS_data)
			
 
				 
			
 
				 
			
--- a/paper_code/CHARLS_preprocess_main_paper.py
+++ b/paper_code/CHARLS_preprocess_main_paper.py
@@ -0,0 +1,1078 @@
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+import pyreadstat
			
 
				+from datetime import date
			
 
				+from lunarcalendar import Converter, Lunar
			
 
				+
			
 
				+#统一列名
			
 
				+def change_columns(df):
			
 
				+    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
			
 
				+                  "waist", "Systolic","Diastolic",
			
 
				+
			
 
				+                  'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp', 
			
 
				+                  'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
			
 
				+
			
 
				+                  'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
			
 
				+                  
			
 
				+                  'Physical_activity',
			
 
				+                  
			
 
				+                  'Smoke','Drink',
			
 
				+                  
			
 
				+                  "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
			
 
				+                  ]
			
 
				+# 2020年把帕金森和记忆病症分开，需要和以前对齐   
			
 
				+def process_row(row):
			
 
				+    da002_12_ = row['da003_12_']
			
 
				+    da002_13_ = row['da003_13_']
			
 
				+    
			
 
				+    if da002_12_ == 1 or da002_13_ == 1:
			
 
				+        return 1
			
 
				+    elif da002_12_ == 2 and da002_13_ == 2:
			
 
				+        return 2
			
 
				+    elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
			
 
				+        return 2
			
 
				+    elif pd.isna(da002_12_) and pd.isna(da002_13_):
			
 
				+        return np.nan
			
 
				+    else:
			
 
				+        return np.nan  # 预防万一，其余情况下设为NA
			
 
				+    
			
 
				+def update_da051(value):
			
 
				+    if value == 1:
			
 
				+        return 3
			
 
				+    elif value == 3:
			
 
				+        return 1
			
 
				+    else:
			
 
				+        return value
			
 
				+    
			
 
				+if __name__ == "__main__":
			
 
				+    # 2011年
			
 
				+    year = "2011"
			
 
				+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
			
 
				+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
			
 
				+    biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
			
 
				+    blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
			
 
				+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
			
 
				+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
			
 
				+    exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
			
 
				+    #性别#年龄#居住地#婚姻状况
			
 
				+    # 1 married or partnered
			
 
				+    # 0 other marital status (separated, divorced, unmarried, or widowed)
			
 
				+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
			
 
				+    
			
 
				+    #教育
			
 
				+    # 0 not finish primary school/No formal education illiterate
			
 
				+    # 1 finish primary school
			
 
				+    demo["education"] = demo["bd001"].apply(lambda x : 1 if x in [3,4,5,6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
			
 
				+    
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				+
			
 
				+    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
			
 
				+
			
 
				+    #居住地
			
 
				+    data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
 
				+
			
 
				+    #身高#体重#收缩压#舒张压
			
 
				+    biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) 
			
 
				+    biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    #腰围
			
 
				+    biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) 
			
 
				+    #血压测量后两次的平均
			
 
				+    biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) 
			
 
				+    biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) 
			
 
				+    biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
			
 
				+    biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
			
 
				+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic']]
			
 
				+    data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
			
 
				+
			
 
				+    #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
			
 
				+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
			
 
				+    blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
			
 
				+    data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
			
 
				+    # 慢性病：
			
 
				+    # (1)  Hypertension 高血压病    
			
 
				+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常（包括低密度脂蛋白、甘油三酯、总胆固醇的升高或（和）高密度脂蛋白的下降）
			
 
				+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高（包括糖耐量异常和空腹血糖升高）
			
 
				+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤（不包括轻度皮肤癌）
			
 
				+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病（不包括肿瘤或癌）
			
 
				+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
			
 
				+    # （除脂肪肝、肿瘤或癌外）
			
 
				+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病（如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病）
			
 
				+    # (8)	 Stroke  中风
			
 
				+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病（不包括肿瘤或癌）
			
 
				+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病（不包括肿瘤或癌）
			
 
				+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
			
 
				+    # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
			
 
				+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
			
 
				+    # (14)  Asthma  哮喘
			
 
				+
			
 
				+    # 体力活动
			
 
				+    # 2 vigorous (vigorous activity more than once a week)
			
 
				+    # 1 moderate (moderate activity more than once a week)
			
 
				+    # 0 inactive (the rest)
			
 
				+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
			
 
				+                                                             1 if x["da051_2_"]==1 else 
			
 
				+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
			
 
				+                                                             else np.nan ,axis=1)
			
 
				+    # 抽烟
			
 
				+    # 1 抽过烟
			
 
				+    # 0 没有抽过烟
			
 
				+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
			
 
				+
			
 
				+    # 喝酒
			
 
				+    # 1 喝过酒
			
 
				+    # 0 没有喝过酒
			
 
				+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
			
 
				+                                                 0 if x["da069"] == 1 else 
			
 
				+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
			
 
				+
			
 
				+    health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
			
 
				+                                   ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
			
 
				+                                   ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"]]
			
 
				+    
			
 
				+    data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
			
 
				+
			
 
				+    
			
 
				+    #计算认知功能得分，分成三部分：电话问卷9分，词语回忆20分、画图1分
			
 
				+    health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
			
 
				+
			
 
				+    #词语记忆
			
 
				+    health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
			
 
				+    # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
			
 
				+    # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
			
 
				+    #画图
			
 
				+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+
			
 
				+    data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
			
 
				+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
			
 
				+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
			
 
				+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
			
 
				+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
			
 
				+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
			
 
				+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
			
 
				+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
			
 
				+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
			
 
				+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
			
 
				+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
			
 
				+        health_status["draw_score"]
			
 
				+    #心理得分
			
 
				+    health_status["dc009_score"] = health_status["dc009"]-1
			
 
				+    health_status["dc010_score"] = health_status["dc010"]-1
			
 
				+    health_status["dc011_score"] = health_status["dc011"]-1
			
 
				+    health_status["dc012_score"] = health_status["dc012"]-1   
			
 
				+    health_status["dc013_score"] = 4 - health_status["dc013"] 
			
 
				+    health_status["dc014_score"] = health_status["dc014"]-1   
			
 
				+    health_status["dc015_score"] = health_status["dc015"]-1   
			
 
				+    health_status["dc016_score"] = 4 - health_status["dc016"]
			
 
				+    health_status["dc017_score"] = health_status["dc017"]-1   
			
 
				+    health_status["dc018_score"] = health_status["dc018"]-1 
			
 
				+    data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
			
 
				+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
			
 
				+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
			
 
				+    #睡眠状态
			
 
				+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
			
 
				+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
			
 
				+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				+    data_2011["sleep_state"] = health_status['dc015']
			
 
				+
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+
			
 
				+    data_2011["wave"] = year
			
 
				+    change_columns(data_2011)
			
 
				+    # 2011年的ID和其他年份有一点区别，倒数第三位加0
			
 
				+    data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
			
 
				+
			
 
				+    # 2013年
			
 
				+    year = "2013"
			
 
				+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
			
 
				+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
			
 
				+    biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
			
 
				+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
			
 
				+    exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
			
 
				+
			
 
				+    #性别#年龄#婚姻状况
			
 
				+    # 1 married or partnered
			
 
				+    # 0 other marital status (separated, divorced, unmarried, or widowed)
			
 
				+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
			
 
				+
			
 
				+    #教育
			
 
				+    # 0 not finish primary school/No formal education illiterate
			
 
				+    # 1 finish primary school
			
 
				+
			
 
				+    # 纠正2011年统计错误的教育
			
 
				+    demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
			
 
				+    demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x in [3,4,5, 6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
			
 
				+    education_correct = demo[['ID',"education_correct"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
			
 
				+    # 删除多余的列
			
 
				+    data_2011 = data_2011.drop(columns=['education_correct'])
			
 
				+
			
 
				+    #更新2013的教育
			
 
				+    demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
			
 
				+    demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
			
 
				+    #合并2011年的教育
			
 
				+    eductaion_2011 = data_2011[['ID',"education"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
			
 
				+
			
 
				+    # 纠正2011年统计错误的出生年
			
 
				+    demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
			
 
				+    birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
			
 
				+    data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
			
 
				+    # 删除多余的列
			
 
				+    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
			
 
				+    #合并2011年的出生年
			
 
				+    birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
			
 
				+    demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
			
 
				+
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				+
			
 
				+    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
			
 
				+
			
 
				+    #居住地
			
 
				+    data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
 
				+
			
 
				+    #身高#体重#收缩压#舒张压
			
 
				+    biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) 
			
 
				+    biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    #腰围
			
 
				+    biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) 
			
 
				+    #血压测量后两次的平均
			
 
				+    biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) 
			
 
				+    biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) 
			
 
				+    biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
			
 
				+    biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
			
 
				+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic']]
			
 
				+    data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
			
 
				+
			
 
				+    #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
			
 
				+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
			
 
				+    data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
			
 
				+    
			
 
				+    # 慢性病：
			
 
				+    # (1)  Hypertension 高血压病    
			
 
				+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常（包括低密度脂蛋白、甘油三酯、总胆固醇的升高或（和）高密度脂蛋白的下降）
			
 
				+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高（包括糖耐量异常和空腹血糖升高）
			
 
				+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤（不包括轻度皮肤癌）
			
 
				+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病（不包括肿瘤或癌）
			
 
				+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
			
 
				+    # （除脂肪肝、肿瘤或癌外）
			
 
				+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病（如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病）
			
 
				+    # (8)	 Stroke  中风
			
 
				+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病（不包括肿瘤或癌）
			
 
				+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病（不包括肿瘤或癌）
			
 
				+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
			
 
				+    # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
			
 
				+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
			
 
				+    # (14)  Asthma  哮喘
			
 
				+
			
 
				+
			
 
				+    # 体力活动
			
 
				+    # 2 vigorous (vigorous activity more than once a week)
			
 
				+    # 1 moderate (moderate activity more than once a week)
			
 
				+    # 0 inactive (the rest)
			
 
				+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
			
 
				+                                                             1 if x["da051_2_"]==1 else 
			
 
				+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
			
 
				+                                                             else np.nan ,axis=1)
			
 
				+    
			
 
				+    # 抽烟
			
 
				+    # 1 抽过烟
			
 
				+    # 0 没有抽过烟
			
 
				+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
			
 
				+
			
 
				+    # 喝酒
			
 
				+    # 1 喝过酒
			
 
				+    # 0 没有喝过酒
			
 
				+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
			
 
				+                                                 0 if x["da069"] == 1 else 
			
 
				+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
			
 
				+    
			
 
				+    # 合并2011年的慢性病
			
 
				+    columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
			
 
				+                                   ,'da007_12_','da007_13_','da007_14_']
			
 
				+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
			
 
				+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
			
 
				+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
			
 
				+    
			
 
				+    diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
			
 
				+
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    for col in columns_to_diseases_new:
			
 
				+        health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
			
 
				+
			
 
				+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
			
 
				+    
			
 
				+    data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
			
 
				+
			
 
				+    #计算认知功能得分，分成三部分：电话问卷9分，词语回忆10分、画图1分
			
 
				+    health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
			
 
				+
			
 
				+    #词语记忆
			
 
				+    health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
			
 
				+    # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
			
 
				+    # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
			
 
				+    #画图
			
 
				+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+
			
 
				+    data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
			
 
				+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
			
 
				+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
			
 
				+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
			
 
				+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
			
 
				+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
			
 
				+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
			
 
				+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
			
 
				+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
			
 
				+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
			
 
				+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
			
 
				+        health_status["draw_score"]
			
 
				+    #心理得分
			
 
				+    health_status["dc009_score"] = health_status["dc009"]-1
			
 
				+    health_status["dc010_score"] = health_status["dc010"]-1
			
 
				+    health_status["dc011_score"] = health_status["dc011"]-1
			
 
				+    health_status["dc012_score"] = health_status["dc012"]-1   
			
 
				+    health_status["dc013_score"] = 4 - health_status["dc013"] 
			
 
				+    health_status["dc014_score"] = health_status["dc014"]-1   
			
 
				+    health_status["dc015_score"] = health_status["dc015"]-1   
			
 
				+    health_status["dc016_score"] = 4 - health_status["dc016"]
			
 
				+    health_status["dc017_score"] = health_status["dc017"]-1   
			
 
				+    health_status["dc018_score"] = health_status["dc018"]-1 
			
 
				+    data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
			
 
				+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
			
 
				+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
			
 
				+    
			
 
				+    #睡眠状态
			
 
				+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
			
 
				+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
			
 
				+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				+    data_2013["sleep_state"] = health_status['dc015']
			
 
				+
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+    
			
 
				+    data_2013["wave"] = year
			
 
				+    change_columns(data_2013)
			
 
				+    data_2013 = pd.concat([data_2011, data_2013], axis=0)
			
 
				+
			
 
				+    # 2015年
			
 
				+    year = "2015"
			
 
				+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
			
 
				+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
			
 
				+    blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
			
 
				+    biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
			
 
				+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
			
 
				+
			
 
				+    #性别#年龄#婚姻状况
			
 
				+    # 1 married or partnered
			
 
				+    # 0 other marital status (separated, divorced, unmarried, or widowed)
			
 
				+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
			
 
				+    
			
 
				+    #教育
			
 
				+    # 0 not finish primary school/No formal education illiterate
			
 
				+    # 1 finish primary school
			
 
				+    #更新2015的教育
			
 
				+    demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
			
 
				+    demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
			
 
				+    #合并2013年的教育
			
 
				+    eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
			
 
				+
			
 
				+    # 2015年的出生年
			
 
				+    demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
			
 
				+    
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				+
			
 
				+    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
			
 
				+
			
 
				+    #居住地
			
 
				+    data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
 
				+
			
 
				+    #身高#体重#收缩压#舒张压
			
 
				+    biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) 
			
 
				+    biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    #腰围
			
 
				+    biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) 
			
 
				+    #血压测量后两次的平均
			
 
				+    biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) 
			
 
				+    biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) 
			
 
				+    biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) 
			
 
				+    biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
			
 
				+    biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
			
 
				+    #身高#体重#收缩压#舒张压
			
 
				+    biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic']]
			
 
				+    data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
			
 
				+
			
 
				+    #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
			
 
				+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
			
 
				+    blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
			
 
				+    data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
			
 
				+    
			
 
				+    # 慢性病：
			
 
				+    # (1)  Hypertension 高血压病    
			
 
				+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常（包括低密度脂蛋白、甘油三酯、总胆固醇的升高或（和）高密度脂蛋白的下降）
			
 
				+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高（包括糖耐量异常和空腹血糖升高）
			
 
				+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤（不包括轻度皮肤癌）
			
 
				+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病（不包括肿瘤或癌）
			
 
				+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
			
 
				+    # （除脂肪肝、肿瘤或癌外）
			
 
				+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病（如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病）
			
 
				+    # (8)	 Stroke  中风
			
 
				+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病（不包括肿瘤或癌）
			
 
				+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病（不包括肿瘤或癌）
			
 
				+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
			
 
				+    # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
			
 
				+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
			
 
				+    # (14)  Asthma  哮喘
			
 
				+
			
 
				+    # 体力活动
			
 
				+    # 2 vigorous (vigorous activity more than once a week)
			
 
				+    # 1 moderate (moderate activity more than once a week)
			
 
				+    # 0 inactive (the rest)
			
 
				+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
			
 
				+                                                             1 if x["da051_2_"]==1 else 
			
 
				+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
			
 
				+                                                             else np.nan ,axis=1)
			
 
				+    
			
 
				+    # 抽烟
			
 
				+    # 1 抽过烟
			
 
				+    # 0 没有抽过烟
			
 
				+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
			
 
				+
			
 
				+    # 喝酒
			
 
				+    # 1 喝过酒
			
 
				+    # 0 没有喝过酒
			
 
				+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
			
 
				+                                                 0 if x["da069"] == 1 else 
			
 
				+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
			
 
				+
			
 
				+    # 合并2013年的慢性病
			
 
				+    columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
			
 
				+                                   ,'da007_12_','da007_13_','da007_14_']
			
 
				+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
			
 
				+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
			
 
				+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
			
 
				+    
			
 
				+    diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
			
 
				+
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    for col in columns_to_diseases_new:
			
 
				+        health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
			
 
				+
			
 
				+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
			
 
				+    
			
 
				+    data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
			
 
				+
			
 
				+    #计算认知功能得分，分成三部分：电话问卷9分，词语回忆10分、画图1分
			
 
				+    health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
			
 
				+
			
 
				+    #词语记忆
			
 
				+    health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
			
 
				+    health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                           
			
 
				+    # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) 
			
 
				+    health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)                                            
			
 
				+    health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)                                            
			
 
				+    # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
			
 
				+    #画图
			
 
				+    health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+
			
 
				+    data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
			
 
				+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
			
 
				+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
			
 
				+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
			
 
				+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
			
 
				+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
			
 
				+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
			
 
				+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
			
 
				+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
			
 
				+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
			
 
				+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
			
 
				+        health_status["draw_score"]
			
 
				+    #心理得分
			
 
				+    health_status["dc009_score"] = health_status["dc009"]-1
			
 
				+    health_status["dc010_score"] = health_status["dc010"]-1
			
 
				+    health_status["dc011_score"] = health_status["dc011"]-1
			
 
				+    health_status["dc012_score"] = health_status["dc012"]-1   
			
 
				+    health_status["dc013_score"] = 4 - health_status["dc013"] 
			
 
				+    health_status["dc014_score"] = health_status["dc014"]-1   
			
 
				+    health_status["dc015_score"] = health_status["dc015"]-1   
			
 
				+    health_status["dc016_score"] = 4 - health_status["dc016"]
			
 
				+    health_status["dc017_score"] = health_status["dc017"]-1   
			
 
				+    health_status["dc018_score"] = health_status["dc018"]-1 
			
 
				+    data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
			
 
				+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
			
 
				+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
			
 
				+    #睡眠状态
			
 
				+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
			
 
				+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
			
 
				+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				+    data_2015["sleep_state"] = health_status['dc015']
			
 
				+
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+    
			
 
				+    data_2015["wave"] = year
			
 
				+    change_columns(data_2015)
			
 
				+    data_2015 = pd.concat([data_2013, data_2015], axis=0)
			
 
				+
			
 
				+    # 2018年
			
 
				+    year = "2018"
			
 
				+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
			
 
				+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
			
 
				+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				+    health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
			
 
				+    cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
			
 
				+
			
 
				+    #性别#年龄#婚姻状况
			
 
				+    # 1 married or partnered
			
 
				+    # 0 other marital status (separated, divorced, unmarried, or widowed)
			
 
				+    demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
			
 
				+
			
 
				+    #教育
			
 
				+    # 0 not finish primary school/No formal education illiterate
			
 
				+    # 1 finish primary school
			
 
				+    #更新2015的教育
			
 
				+    demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
			
 
				+    demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
			
 
				+
			
 
				+    # 出生年
			
 
				+    demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
			
 
				+
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				+
			
 
				+    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
			
 
				+
			
 
				+    #居住地
			
 
				+    data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
 
				+
			
 
				+    #身高#体重#收缩压#舒张压
			
 
				+    data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
			
 
				+
			
 
				+    #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
			
 
				+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
			
 
				+    data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
			
 
				+    
			
 
				+    # 慢性病：
			
 
				+    # (1)  Hypertension 高血压病    
			
 
				+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常（包括低密度脂蛋白、甘油三酯、总胆固醇的升高或（和）高密度脂蛋白的下降）
			
 
				+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高（包括糖耐量异常和空腹血糖升高）
			
 
				+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤（不包括轻度皮肤癌）
			
 
				+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病（不包括肿瘤或癌）
			
 
				+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
			
 
				+    # （除脂肪肝、肿瘤或癌外）
			
 
				+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病（如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病）
			
 
				+    # (8)	 Stroke  中风
			
 
				+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病（不包括肿瘤或癌）
			
 
				+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病（不包括肿瘤或癌）
			
 
				+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
			
 
				+    # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
			
 
				+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
			
 
				+    # (14)  Asthma  哮喘
			
 
				+
			
 
				+    # 体力活动
			
 
				+    # 2 vigorous (vigorous activity more than once a week)
			
 
				+    # 1 moderate (moderate activity more than once a week)
			
 
				+    # 0 inactive (the rest)
			
 
				+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 
			
 
				+                                                             1 if x["da051_2_"]==1 else 
			
 
				+                                                             0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) 
			
 
				+                                                             else np.nan ,axis=1)
			
 
				+    
			
 
				+    # 抽烟
			
 
				+    # 1 抽过烟
			
 
				+    # 0 没有抽过烟
			
 
				+    health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
			
 
				+
			
 
				+    # 喝酒
			
 
				+    # 1 喝过酒
			
 
				+    # 0 没有喝过酒
			
 
				+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 
			
 
				+                                                 0 if x["da069"] == 1 else 
			
 
				+                                                 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
			
 
				+    
			
 
				+    columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
			
 
				+                                   ,'da007_12_','da007_13_','da007_14_']
			
 
				+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
			
 
				+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
			
 
				+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
			
 
				+    
			
 
				+    diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
			
 
				+
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    for col in columns_to_diseases_new:
			
 
				+        health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
			
 
				+
			
 
				+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
			
 
				+
			
 
				+    data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
			
 
				+
			
 
				+    #计算认知功能得分，分成三部分：电话问卷9分，词语回忆10分、画图1分
			
 
				+    cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
			
 
				+    cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
			
 
				+    cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
			
 
				+    cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
			
 
				+    # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
			
 
				+    cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) 
			
 
				+    cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) 
			
 
				+    cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
			
 
				+    cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
			
 
				+    cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
			
 
				+
			
 
				+    #词语记忆
			
 
				+    cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
			
 
				+    cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
			
 
				+    cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
			
 
				+    cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1) 
			
 
				+    cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1) 
			
 
				+    cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)                                            
			
 
				+    cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1) 
			
 
				+    cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1) 
			
 
				+    cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)                                            
			
 
				+    cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)                                           
			
 
				+    # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
			
 
				+    cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1) 
			
 
				+    cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1) 
			
 
				+    cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1) 
			
 
				+    cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1) 
			
 
				+    cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1) 
			
 
				+    cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)                                            
			
 
				+    cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1) 
			
 
				+    cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1) 
			
 
				+    cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)                                            
			
 
				+    cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)                                            
			
 
				+    # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
			
 
				+    #画图
			
 
				+    cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
			
 
				+
			
 
				+    data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
			
 
				+        cognition["dc001s3_score"] + cognition["dc002_score"]+ \
			
 
				+        cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
			
 
				+        cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
			
 
				+        cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
			
 
				+        cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
			
 
				+        cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
			
 
				+        cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
			
 
				+        cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
			
 
				+        cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
			
 
				+        cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
			
 
				+        cognition["draw_score"]
			
 
				+    #心理得分
			
 
				+    cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) 
			
 
				+    cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) 
			
 
				+    data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
			
 
				+        cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
			
 
				+        cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
			
 
				+    #睡眠状态
			
 
				+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
			
 
				+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
			
 
				+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				+    data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x) 
			
 
				+    
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+    
			
 
				+    data_2018["wave"] = year
			
 
				+    change_columns(data_2018)
			
 
				+    data_2018 = pd.concat([data_2015, data_2018], axis=0)
			
 
				+
			
 
				+    # 2020年
			
 
				+    year = "2020"
			
 
				+    demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
			
 
				+    psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
			
 
				+    health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
			
 
				+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
			
 
				+
			
 
				+    #性别#年龄#婚姻状况
			
 
				+    # 1 married or partnered
			
 
				+    # 0 other marital status (separated, divorced, unmarried, or widowed)
			
 
				+    demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
			
 
				+
			
 
				+    #教育
			
 
				+    # 0 not finish primary school/No formal education illiterate
			
 
				+    # 1 finish primary school
			
 
				+    demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
			
 
				+    demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
			
 
				+    #合并2018年的教育
			
 
				+    eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
			
 
				+
			
 
				+    # 出生年
			
 
				+    demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
			
 
				+    demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
			
 
				+    #合并2018年的出生年
			
 
				+    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
			
 
				+    demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
			
 
				+
			
 
				+    #获取随访时间
			
 
				+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
			
 
				+
			
 
				+    demo["ba003"] = 1
			
 
				+
			
 
				+    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
			
 
				+    #居住地
			
 
				+    data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
			
 
				+
			
 
				+    #身高#体重#收缩压#舒张压
			
 
				+    data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
			
 
				+
			
 
				+    #白细胞（WBC），平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
			
 
				+    #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
			
 
				+    data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
			
 
				+    
			
 
				+    # 慢性病：
			
 
				+    # (1)  Hypertension 高血压病    
			
 
				+    # (2)	Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常（包括低密度脂蛋白、甘油三酯、总胆固醇的升高或（和）高密度脂蛋白的下降）
			
 
				+    # (3)	Diabetes or high blood sugar糖尿病或血糖升高（包括糖耐量异常和空腹血糖升高）
			
 
				+    # (4)	Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤（不包括轻度皮肤癌）
			
 
				+    # (5)	Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病（不包括肿瘤或癌）
			
 
				+    #        (6)  Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
			
 
				+    # （除脂肪肝、肿瘤或癌外）
			
 
				+    # (7)	Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病（如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病）
			
 
				+    # (8)	 Stroke  中风
			
 
				+    # (9)	 Kidney disease (except for tumor or cancer) 肾脏疾病（不包括肿瘤或癌）
			
 
				+    # (10)	 Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病（不包括肿瘤或癌）
			
 
				+    # (11)	 Emotional, nervous, or psychiatric problems 情感及精神方面问题 
			
 
				+    # (12)	 Memory-related disease 与记忆相关的疾病 （如老年痴呆症、脑萎缩、帕金森症）
			
 
				+    # (13)	 Arthritis or rheumatism 关节炎或风湿病
			
 
				+    # (14)  Asthma  哮喘
			
 
				+    # 2020年把帕金森和记忆病症分开，需要和以前对齐
			
 
				+
			
 
				+    # 体力活动
			
 
				+    # 2 vigorous (vigorous activity more than once a week)
			
 
				+    # 1 moderate (moderate activity more than once a week)
			
 
				+    # 0 inactive (the rest)
			
 
				+    health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else 
			
 
				+                                                             1 if x["da032_2_"]==1 else 
			
 
				+                                                             0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2) 
			
 
				+                                                             else np.nan ,axis=1)
			
 
				+    
			
 
				+    # 抽烟
			
 
				+    # 1 抽过烟
			
 
				+    # 0 没有抽过烟
			
 
				+    health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
			
 
				+
			
 
				+    # 喝酒
			
 
				+    # 1 喝过酒
			
 
				+    # 0 没有喝过酒
			
 
				+    health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else 
			
 
				+                                                 0 if x["da051"] == 3 else np.nan, axis=1)
			
 
				+
			
 
				+    health_status['da003_12_'] = health_status.apply(process_row, axis=1)
			
 
				+
			
 
				+    columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
			
 
				+                                   ,'da003_12_','da003_14_','da003_15_']
			
 
				+    columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
			
 
				+    for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
			
 
				+        health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
			
 
				+    
			
 
				+    diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
			
 
				+
			
 
				+    # 按 'ID' 列合并两个表
			
 
				+    health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
			
 
				+    # 使用 fillna() 来更新字段
			
 
				+    for col in columns_to_diseases_new:
			
 
				+        health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
			
 
				+
			
 
				+    health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
			
 
				+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
			
 
				+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
			
 
				+    
			
 
				+    data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
			
 
				+
			
 
				+    #计算认知功能得分，分成三部分：电话问卷9分，词语回忆10分、画图1分
			
 
				+    health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+    health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+    health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+    health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+    health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+    health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1) 
			
 
				+    health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1) 
			
 
				+    health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
			
 
				+    health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
			
 
				+    health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
			
 
				+
			
 
				+    #词语记忆
			
 
				+    health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
			
 
				+    health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
			
 
				+    health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
			
 
				+    health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1) 
			
 
				+    health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1) 
			
 
				+    health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)                                            
			
 
				+    health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1) 
			
 
				+    health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1) 
			
 
				+    health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)                                            
			
 
				+    health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)                                           
			
 
				+    health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1) 
			
 
				+    health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1) 
			
 
				+    health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1) 
			
 
				+    health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1) 
			
 
				+    health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1) 
			
 
				+    health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)                                            
			
 
				+    health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1) 
			
 
				+    health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1) 
			
 
				+    health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)                                            
			
 
				+    health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)                                            
			
 
				+    #画图
			
 
				+    health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
			
 
				+
			
 
				+    data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
			
 
				+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
			
 
				+        health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
			
 
				+        health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
			
 
				+        health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
			
 
				+        health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
			
 
				+        health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
			
 
				+        health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
			
 
				+        health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
			
 
				+        health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
			
 
				+        health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
			
 
				+        health_status["draw_score"]
			
 
				+    #心理得分
			
 
				+    health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) 
			
 
				+    health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
			
 
				+    health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)   
			
 
				+    health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) 
			
 
				+    data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
			
 
				+        health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
			
 
				+        health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
			
 
				+    
			
 
				+    #睡眠状态
			
 
				+    # (1)Rarely or none of the time (<1 day)  很少或者根本没有（<１天）
			
 
				+    # (2)Some or a little of the time (1-2 days) 不太多（１-２天）
			
 
				+    # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
			
 
				+    # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
			
 
				+    data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x) 
			
 
				+
			
 
				+    #ADL
			
 
				+    health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
			
 
				+    data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
			
 
				+                        health_status["db014_score"] + health_status["db015_score"]
			
 
				+    
			
 
				+    data_2020["wave"] = year
			
 
				+    change_columns(data_2020)
			
 
				+    data_2020 = pd.concat([data_2018, data_2020], axis=0)
			
 
				+
			
 
				+    #修改地区名称
			
 
				+    #省份、城市名称和污染物数据格式对齐
			
 
				+    #海东地区->海东市
			
 
				+    data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
			
 
				+    #北京 -> 北京市
			
 
				+    data_2020['city'] = data_2020['city'].replace('北京', '北京市')
			
 
				+    data_2020['province'] = data_2020['province'].replace('北京', '北京市')
			
 
				+    #哈尔滨 -> 哈尔滨市
			
 
				+    data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
			
 
				+    #天津 -> 天津市
			
 
				+    data_2020['city'] = data_2020['city'].replace('天津', '天津市')
			
 
				+    data_2020['province'] = data_2020['province'].replace('天津', '天津市')
			
 
				+    #广西省 -> 广西壮族自治区
			
 
				+    data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
			
 
				+    #巢湖市 -> 合肥市
			
 
				+    data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
			
 
				+    #襄樊市->襄阳市
			
 
				+    data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市') 
			
 
				+
			
 
				+    data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
			
 
				+    print(123)
			
--- a/paper_code/code.R
+++ b/paper_code/code.R
@@ -2,20 +2,48 @@
 
				 
			
 
				 library(msm)
			
 
				 library(survival)
			
 
				+library(dplyr)
			
 
				 
			
 
				 # View(cav)
			
 
				 
			
 
				 data <- read.csv("paper_data.csv")
			
 
				 
			
 
				-# data$age_group <- cut(data$age,
			
 
				-#                       breaks = c(45, 55, 65, Inf),
			
 
				-#                       labels = c("45-54", "55-64",">=65"),
			
 
				-#                       right = FALSE)
			
 
				 
			
 
				-data$gender_ <- factor(data$rgender, levels = c(1, 2), labels = c("男", "女"))
			
 
				+#性别
			
 
				+data$rgender_group <- factor(data$rgender, levels = c(1, 2), labels = c("Male", "Female"))
			
 
				 
			
 
				-summary(data$age_group)
			
 
				-# View(data[, c("age", "age_group")])
			
 
				+#年齡
			
 
				+data$age_group <- cut(data$age,
			
 
				+                      breaks = c(45, 55, 65, Inf),
			
 
				+                      labels = c("45-54", "55-64",">=65"),
			
 
				+                      right = FALSE)
			
 
				+#婚姻
			
 
				+data$marital_group <- factor(data$marital_status, levels = c(1, 0), labels = c("married or partnered", "other marital status"))
			
 
				+
			
 
				+
			
 
				+#教育
			
 
				+data$education_group <- factor(data$education, levels = c(0, 1, 2), labels = c("below high school", "high school", "college or above"))
			
 
				+
			
 
				+#运动情况
			
 
				+data$activity_group <- factor(data$Physical_activity, levels = c(0, 1, 2), labels = c("inactive", "moderate", "vigorous"))
			
 
				+
			
 
				+#心理得分
			
 
				+data$psychiatric_group <- cut(data$Psychiatric_score, breaks = c(0, 10, Inf),labels = c("无抑郁", "有抑郁"), right = FALSE)
			
 
				+
			
 
				+#BMI
			
 
				+# 使用 cut() 创建因子类型的变量
			
 
				+data$variable <- cut(data$BMI, breaks = c(0, 18.5, 24, Inf), labels = c("underweight", "normal", "overweight"), right = FALSE)
			
 
				+# 使用 dplyr::recode 直接替换因子水平
			
 
				+data$variable <- recode(data$variable, "underweight" = 1L, "normal" = 0L, "overweight" = 2L)
			
 
				+# 将 data$variable 转换回因子并保留原 levels
			
 
				+data$BMI_group <- factor(data$variable, levels = c(0, 1, 2), labels = c("normal", "underweight", "overweight"))
			
 
				+
			
 
				+#ADL
			
 
				+# 使用 cut() 创建因子类型的变量
			
 
				+data$ADL_group <- cut(data$ADL, breaks = c(0, 1, 3, Inf), labels = c("No impairment", "Mild impairment", "Severe impairment"), right = FALSE)
			
 
				+
			
 
				+summary(data)
			
 
				+View(data[, c("ADL", "variable")])
			
 
				 View(data$age_group)
			
 
				 # 计算状态转移频数表
			
 
				 freq_table <- statetable.msm(state, ID, data = data)
			
@@ -36,11 +64,25 @@ View(crude_init)
 
				 # 进行多状态模型分析
			
 
				 msm_model <- msm(state ~ wave, subject = ID, data = data,
			
 
				                  qmatrix = crude_init,
			
 
				-                 covariates = ~ gender_, 
			
 
				+                 covariates = ~ rgender_group+age_group+marital_group+education_group+activity_group+psychiatric_group+BMI_group+ADL_group+Smoke+Drink+last_year_O3+last_year_pm1+last_year_NO2+last_year_NH4+last_year_nl,
			
 
				                  death = 4,
			
 
				-                 method = "BFGS", control = list(fnscale = 4000, maxit = 10000)
			
 
				+                 method = "BFGS", control = list(fnscale = 5000, maxit = 10000)
			
 
				                  )
			
 
				 
			
 
				+
			
 
				+# 获取模型结果并转换为字符串
			
 
				+model_summary <- capture.output(summary(msm_model))
			
 
				+# 创建一个空的结果文件
			
 
				+result_file <- "msm_model_results_test.txt"
			
 
				+write("", file = result_file)  # 清空文件内容
			
 
				+# 写入文件，附加协变量名称
			
 
				+cat("Results for covariates:", file = result_file, append = TRUE)
			
 
				+cat(model_summary, file = result_file, sep = "\n", append = TRUE)
			
 
				+
			
 
				+
			
 
				+# 查看模型的详细结果
			
 
				+summary(msm_model)
			
 
				+
			
 
				 # 计算状态转移概率矩阵
			
 
				 prob_matrix <- pmatrix.msm(msm_model, t = 5)  # t = 1 代表随访之间的间隔时间
			
 
				 print(prob_matrix)
			
@@ -59,7 +101,32 @@ print(so_journ)
 
				 
			
 
				 # 计算均衡状态概率
			
 
				 
			
 
				-# 查看模型的详细结果
			
 
				-summary(msm_model)
			
 
				-
			
 
				 rm(list = ls())
			
 
				+
			
 
				+
			
 
				+# 定义协变量列表
			
 
				+covariates_list <- list(~last_year_NO3, ~before_last_NO3, ~last_year_NH4, ~before_last_NH4, ~last_year_OM, ~before_last_OM, ~last_year_BC, ~before_last_BC, ~last_year_nl, ~before_last_nl)
			
 
				+
			
 
				+# 创建一个空的结果文件
			
 
				+result_file <- "msm_model_results.txt"
			
 
				+write("", file = result_file)  # 清空文件内容
			
 
				+
			
 
				+# 循环计算不同协变量的模型
			
 
				+for (cov in covariates_list) {
			
 
				+  
			
 
				+  # 运行msm模型
			
 
				+  msm_model <- msm(state ~ wave, subject = ID, data = data,
			
 
				+                   qmatrix = crude_init,
			
 
				+                   covariates = cov,
			
 
				+                   death = 4,
			
 
				+                   method = "BFGS", control = list(fnscale = 4000, maxit = 10000)
			
 
				+                   )
			
 
				+  
			
 
				+  # 获取模型结果并转换为字符串
			
 
				+  model_summary <- capture.output(summary(msm_model))
			
 
				+  
			
 
				+  # 写入文件，附加协变量名称
			
 
				+  cat("Results for covariates:", deparse(cov), "\n", file = result_file, append = TRUE)
			
 
				+  cat(model_summary, file = result_file, sep = "\n", append = TRUE)
			
 
				+  cat("\n\n", file = result_file, append = TRUE)  # 空行分隔每个协变量结果
			
 
				+}
			
--- a/paper_code/data_preprocess.py
+++ b/paper_code/data_preprocess.py
@@ -5,7 +5,7 @@ from sklearn.experimental import enable_iterative_imputer
 
				 from sklearn.impute import IterativeImputer
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
			
 
				+    CHARLS_data = pd.read_csv("CHARLS_data_p_n_m_nd_h.csv")
			
 
				 
			
 
				     cavariates = ["last_year_NO2", 	"before_last_NO2", 	"last_year_O3", "before_last_O3", 
			
 
				                         "last_year_pm1",	"before_last_pm1",	"last_year_pm2.5",	"before_last_pm2.5",	"last_year_pm10",
			
@@ -175,4 +175,5 @@ if __name__ == "__main__":
 
				     data[data < 0] = 0
			
 
				 
			
 
				     #排序将ID相同的放到一起
			
 
				-    data.to_csv("paper_data.csv", index=False)
			
 
				+    data.to_csv("paper_data.csv", index=False)
			
 
				+    
			
--- a/paper_code/figure.ipynb
+++ b/paper_code/figure.ipynb
--- a/paper_code/figure.py
+++ b/paper_code/figure.py
@@ -0,0 +1,39 @@
 
				+import plotly.graph_objects as go
			
 
				+
			
 
				+# 定义节点和颜色
			
 
				+labels = ["No chronic disease", "One chronic disease", "Comorbidity", "Death"]
			
 
				+colors = ["#4C78A8", "#F58518", "#E45756", "#72B7B2"]
			
 
				+
			
 
				+# 定义各年龄段的连接
			
 
				+source = [0, 0, 1, 1, 2, 2, 0, 1, 2,   # 45~54
			
 
				+          0, 0, 1, 1, 2, 2, 0, 1, 2,   # 55~64
			
 
				+          0, 0, 1, 1, 2, 2, 0, 1, 2]   # >=65
			
 
				+
			
 
				+target = [1, 2, 2, 0, 1, 0, 3, 3, 3, 
			
 
				+          1, 2, 2, 0, 1, 0, 3, 3, 3, 
			
 
				+          1, 2, 2, 0, 1, 0, 3, 3, 3]
			
 
				+
			
 
				+# 定义各连接的值
			
 
				+values = [1, 1, 1, 1, 1, 1, 1, 1, 1,   # 45~54
			
 
				+          1.18, 1.57, 1.26, 1.36, 1.46, 1.10, 1.32, 0.89, 2.59,  # 55~64
			
 
				+          1.34, 2.04, 1.52, 1.71, 1.78, 1.44, 9.09, 10.57, 7.52]  # >=65
			
 
				+
			
 
				+# 创建桑基图
			
 
				+fig = go.Figure(go.Sankey(
			
 
				+    node=dict(
			
 
				+        pad=15,
			
 
				+        thickness=20,
			
 
				+        line=dict(color="black", width=0.5),
			
 
				+        label=labels,
			
 
				+        color=colors
			
 
				+    ),
			
 
				+    link=dict(
			
 
				+        source=source,
			
 
				+        target=target,
			
 
				+        value=values,
			
 
				+        color=["#4C78A8", "#F58518", "#E45756"] * 3  # 不同颜色表示年龄段
			
 
				+    )
			
 
				+))
			
 
				+
			
 
				+fig.update_layout(title_text="Transition of Health States Across Age Groups", font_size=10)
			
 
				+fig.show()
			
--- a/paper_code/figure.r
+++ b/paper_code/figure.r
@@ -0,0 +1,180 @@
 
				+# 基线表
			
 
				+# install.packages("compareGroups", repos = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
			
 
				+# install.packages("rmarkdown", repos = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
			
 
				+
			
 
				+library(compareGroups)
			
 
				+library(dplyr)
			
 
				+
			
 
				+data <- read.csv("paper_data.csv")
			
 
				+
			
 
				+#性别
			
 
				+data$Rgender <- factor(data$rgender, levels = c(1, 2), labels = c("Male", "Female"))
			
 
				+
			
 
				+#年齡
			
 
				+data$Age <- cut(data$age,
			
 
				+                      breaks = c(45, 55, 65, Inf),
			
 
				+                      labels = c("45-54", "55-64",">=65"),
			
 
				+                      right = FALSE)
			
 
				+#婚姻
			
 
				+data$Marital_Status <- factor(data$marital_status, levels = c(1, 0), labels = c("married or partnered", "other marital status"))
			
 
				+
			
 
				+#教育
			
 
				+data$Education <- factor(data$education, levels = c(0, 1, 2), labels = c("below high school", "high school", "college or above"))
			
 
				+
			
 
				+#运动情况
			
 
				+data$Physical_Activity <- factor(data$Physical_activity, levels = c(0, 1, 2), labels = c("inactive", "moderate", "vigorous"))
			
 
				+
			
 
				+#心理得分
			
 
				+data$Depressive_Symptoms <- cut(data$Psychiatric_score, breaks = c(0, 10, Inf),labels = c("No", "Yes"), right = FALSE)
			
 
				+
			
 
				+#BMI
			
 
				+# 使用 cut() 创建因子类型的变量
			
 
				+data$BMI <- cut(data$BMI, breaks = c(0, 18.5, 24, Inf), labels = c("underweight", "normal", "overweight"), right = FALSE)
			
 
				+# 使用 dplyr::recode 直接替换因子水平
			
 
				+data$BMI <- recode(data$BMI, "underweight" = 1L, "normal" = 0L, "overweight" = 2L)
			
 
				+# 将 data$variable 转换回因子并保留原 levels
			
 
				+data$BMI <- factor(data$BMI, levels = c(0, 1, 2), labels = c("normal", "underweight", "overweight"))
			
 
				+
			
 
				+#ADL
			
 
				+# 使用 cut() 创建因子类型的变量
			
 
				+data$ADL <- cut(data$ADL, breaks = c(0, 1, 3, Inf), labels = c("No impairment", "Mild impairment", "Severe impairment"), right = FALSE)
			
 
				+
			
 
				+#抽烟
			
 
				+data$Smoke <- factor(data$Smoke, levels = c(0, 1), labels = c("No", "Yes"))
			
 
				+
			
 
				+#喝酒
			
 
				+data$Drink <- factor(data$Drink, levels = c(0, 1), labels = c("No", "Yes"))
			
 
				+
			
 
				+#结局
			
 
				+data$State <- factor(data$state, levels = c(1, 2, 3, 4), labels = c("No chronic disease", "One chronic disease", "Comorbidity", "Death"))
			
 
				+
			
 
				+# 定义分层变量和汇总变量
			
 
				+vars <- c("State", "Age", "Rgender", "Marital_Status", "Education", "BMI","ADL","Smoke","Drink","Physical_Activity", "Depressive_Symptoms", "last_year_NO2", "last_year_O3")
			
 
				+
			
 
				+formula = State ~ Age+Rgender+Marital_Status+Education+BMI+ADL+Smoke+Drink+Physical_Activity+Depressive_Symptoms+last_year_NO2+last_year_O3
			
 
				+# 创建基线表
			
 
				+table = descrTable(formula = formula, data = data, show.p.overall = FALSE)
			
 
				+
			
 
				+export2word(table, file = "基线表.docx")
			
 
				+
			
 
				+
			
 
				+
			
 
				+# 森林图
			
 
				+library(dplyr)
			
 
				+library(ggforce)
			
 
				+library(ggforestplot)
			
 
				+df_compare_traits_groups <- read.csv("output_new.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
			
 
				+
			
 
				+df_logodds <-
			
 
				+  df_compare_traits_groups %>%
			
 
				+  dplyr::arrange(num) %>%
			
 
				+  # Set the study variable to a factor to preserve order of appearance
			
 
				+  # Set class to factor to set order of display.
			
 
				+  dplyr::mutate(
			
 
				+    trait = factor(
			
 
				+      trait,
			
 
				+      levels = c("No chronic disease to One chronic disease", "No chronic disease to Comorbidity",
			
 
				+     "One chronic disease to Comorbidity", "One chronic disease to No chronic disease", "Comorbidity to One chronic disease",
			
 
				+     "Comorbidity to No chronic disease", "No chronic disease to Death", "One chronic disease to Death", "Comorbidity to Death")
			
 
				+    )
			
 
				+  )
			
 
				+
			
 
				+# Draw a forestplot of cross-sectional, linear associations.
			
 
				+p=forestplot(
			
 
				+  df = df_logodds,
			
 
				+  estimate = beta,
			
 
				+  # pvalue = pvalue,
			
 
				+  # psignif = 0,
			
 
				+  logodds = TRUE,
			
 
				+  title = "The impact results of covariates",
			
 
				+  xlab = "Effect Sizie with 95% CI",
			
 
				+  colour = trait,
			
 
				+  shape=trait,
			
 
				+  xlim = c(0.1, 6),
			
 
				+  # You can explicitly define x-tick breaks
			
 
				+  xtickbreaks = c(0.1, 0.5, 0.8, 1.0,1.2, 1.5, 2.0, 3.0,4.0,5.0)
			
 
				+) +
			
 
				+    ggplot2::scale_shape_manual(
			
 
				+    values = c(6L, 6L, 6L, 2L, 2L,2L,21L,21L,21L),
			
 
				+    labels = c("No chronic disease to One chronic disease", "No chronic disease to Comorbidity",
			
 
				+     "One chronic disease to Comorbidity", "One chronic disease to No chronic disease", "Comorbidity to One chronic disease",
			
 
				+     "Comorbidity to No chronic disease", "No chronic disease to Death", "One chronic disease to Death", "Comorbidity to Death")
			
 
				+  )+
			
 
				+  # 修改字体大小
			
 
				+  ggplot2::theme(
			
 
				+    plot.title = ggplot2::element_text(size = 32, face = "bold"),  # 标题字体
			
 
				+    axis.title = ggplot2::element_text(size = 28),  # 轴标题字体
			
 
				+    axis.text = ggplot2::element_text(size = 28),  # 轴刻度字体
			
 
				+    strip.text = ggplot2::element_text(size = 30),  # facet标签字体
			
 
				+    legend.text = ggplot2::element_text(size = 24),  # 图例字体
			
 
				+    legend.title = element_blank(),  # 移除图例标题
			
 
				+    legend.position = "top",  # 默认位置，右边
			
 
				+  )+
			
 
				+  # 使用 facet_wrap 进行分组显示，按 trait 进行分组
			
 
				+  ggplot2::facet_wrap(~group, scales = "free_y", ncol = 1)
			
 
				+
			
 
				+ggsave("forestplot_output.png", plot = p, width = 35, height = 30, dpi = 300, bg = "white")
			
 
				+
			
 
				+
			
 
				+# library(ggplot2)
			
 
				+# library(dplyr)
			
 
				+
			
 
				+# Assuming df_logodds is your data
			
 
				+# df_logodds <- df_compare_traits_groups %>%
			
 
				+#   dplyr::arrange(num) %>%
			
 
				+#   dplyr::mutate(
			
 
				+#     trait = factor(
			
 
				+#       trait,
			
 
				+#       levels = c("No chronic disease to One chronic disease", "No chronic disease to Comorbidity",
			
 
				+#                  "One chronic disease to Comorbidity", "One chronic disease to No chronic disease", 
			
 
				+#                  "Comorbidity to One chronic disease", "Comorbidity to No chronic disease", 
			
 
				+#                  "No chronic disease to Death", "One chronic disease to Death", "Comorbidity to Death")
			
 
				+#     )
			
 
				+#   )
			
 
				+
			
 
				+# # Draw the forestplot using ggplot2
			
 
				+# p <- ggplot(df_logodds, aes(x = beta, y = trait, color = trait, shape = trait)) +
			
 
				+#   # Add points (the estimates)
			
 
				+#   geom_point(size = 5) +
			
 
				+#   # Add error bars (95% CI)
			
 
				+#   geom_errorbarh(aes(xmin = lower, xmax = upper), height = 0.2) +
			
 
				+#   # Customizing the shape of points
			
 
				+#   scale_shape_manual(
			
 
				+#     values = c(6, 6, 6, 2, 2, 2, 21, 21, 21),
			
 
				+#     labels = c("No chronic disease to One chronic disease", "No chronic disease to Comorbidity",
			
 
				+#                "One chronic disease to Comorbidity", "One chronic disease to No chronic disease", 
			
 
				+#                "Comorbidity to One chronic disease", "Comorbidity to No chronic disease", 
			
 
				+#                "No chronic disease to Death", "One chronic disease to Death", "Comorbidity to Death")
			
 
				+#   ) +
			
 
				+#   # Set the color palette for different traits
			
 
				+#   scale_color_manual(values = c("red", "blue", "green", "purple", "orange", "brown", "pink", "yellow", "gray")) +
			
 
				+#   # Add title and axis labels
			
 
				+#   labs(
			
 
				+#     title = "The impact results of covariates",
			
 
				+#     x = "Effect Size with 95% CI",
			
 
				+#     y = NULL  # No y-axis title, it's already covered by trait names
			
 
				+#   ) +
			
 
				+#   # Adjust the x-axis limits to control the effect size range
			
 
				+#   xlim(0.1, 6) +
			
 
				+#   # Customize text size and legend position
			
 
				+#   theme(
			
 
				+#     plot.title = element_text(size = 32, face = "bold"),  # Title font size and style
			
 
				+#     axis.title = element_text(size = 28),  # Axis title font size
			
 
				+#     axis.text = element_text(size = 28),  # Axis ticks font size
			
 
				+#     strip.text = element_text(size = 30),  # Facet label font size (if any)
			
 
				+#     legend.text = element_text(size = 24),  # Legend font size
			
 
				+#     legend.title = element_blank(),  # Remove legend title
			
 
				+#     legend.position = "top",  # Position of the legend
			
 
				+#     legend.key.size = unit(1.5, "lines"),  # Adjust the size of the legend keys (points)
			
 
				+#     panel.grid.major = element_line(size = 0.5, linetype = "dashed", color = "gray90"),
			
 
				+#     panel.grid.minor = element_blank(),  # Remove minor gridlines
			
 
				+#     panel.background = element_blank(),  # White background for panel
			
 
				+#     plot.background = element_blank()  # White background for plot
			
 
				+#   ) +
			
 
				+#   # Customize x-axis breaks
			
 
				+#   scale_x_continuous(
			
 
				+#     breaks = c(0.1, 0.5, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0)
			
 
				+#   )
			
 
				+
			
 
				+# # Print the plot
			
 
				+# print(p)
			
--- a/paper_code/vif.py
+++ b/paper_code/vif.py
@@ -0,0 +1,25 @@
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+from statsmodels.stats.outliers_influence import variance_inflation_factor
			
 
				+
			
 
				+# 实际数据
			
 
				+df = pd.read_csv("paper_data.csv")
			
 
				+
			
 
				+df = df.drop(columns=["before_last_NO2", "before_last_O3", "last_year_pm10",
			
 
				+                        "before_last_pm1",	"before_last_pm2.5","last_year_pm1","last_year_OM","last_year_BC",
			
 
				+                    	'before_last_pm10',	'last_year_NO3',	'last_year_NH4',	'before_last_SO4',
			
 
				+                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'before_last_nl'])
			
 
				+# 计算每个变量的 VIF方差膨胀因子
			
 
				+def calculate_vif(df):
			
 
				+    vif_data = pd.DataFrame()
			
 
				+    vif_data["Variable"] = df.columns
			
 
				+    vif_data["VIF"] = [
			
 
				+        variance_inflation_factor(df.values, i) for i in range(df.shape[1])
			
 
				+    ]
			
 
				+    return vif_data
			
 
				+
			
 
				+# 调用函数计算 VIF
			
 
				+vif_result = calculate_vif(df)
			
 
				+
			
 
				+# 输出 VIF 结果
			
 
				+print(vif_result)
			
--- a/test.py
+++ b/test.py
@@ -132,19 +132,43 @@ import pandas as pd
 
				 #     cognition["draw_score"]
			
 
				 # cognition.to_csv("/root/r_base/CHARLS/test.csv")
			
 
				 
			
 
				-import pandas as pd
			
 
				-CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
			
 
				-#合并
			
 
				-merge_list = ["marital_status_m",	"Height_m",	"Weight_m",	"waist_m",	"Systolic_m",	"Diastolic_m",
			
 
				-                "Physical_activity_m",	"Smoke_m",	'Drink_m',	'Hypertension_m',	'Disabetes_or_High_Blood_Sugar_m',
			
 
				-                'Cancer_or_Malignant_Tumor_m',	'Chronic_Lung_Diseases_m',	'Heart_Problems_m',	'Emotional_Nervous_or_Psychiatric_Problems_m',
			
 
				-                'Stroke_m',	'Arthritis_or_Rheumatism_m',	'Dyslipidemia_m',	'Liver_Disease_m',	'Kidney_Diease_m',	'Stomach_or_Other_Digestive_Disease_m',
			
 
				-                'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m']
			
 
				-
			
 
				-# 遍历 merge_list 列表
			
 
				-for col_m in merge_list:
			
 
				-    col = col_m.replace('_m', '')  # 去掉 '_m' 得到相应的列名
			
 
				-    if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
			
 
				-        CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
			
 
				-
			
 
				-CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h_test.csv")
			
 
				+# import pandas as pd
			
 
				+# CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
			
 
				+# #合并
			
 
				+# merge_list = ["marital_status_m",	"Height_m",	"Weight_m",	"waist_m",	"Systolic_m",	"Diastolic_m",
			
 
				+#                 "Physical_activity_m",	"Smoke_m",	'Drink_m',	'Hypertension_m',	'Disabetes_or_High_Blood_Sugar_m',
			
 
				+#                 'Cancer_or_Malignant_Tumor_m',	'Chronic_Lung_Diseases_m',	'Heart_Problems_m',	'Emotional_Nervous_or_Psychiatric_Problems_m',
			
 
				+#                 'Stroke_m',	'Arthritis_or_Rheumatism_m',	'Dyslipidemia_m',	'Liver_Disease_m',	'Kidney_Diease_m',	'Stomach_or_Other_Digestive_Disease_m',
			
 
				+#                 'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m']
			
 
				+
			
 
				+# # 遍历 merge_list 列表
			
 
				+# for col_m in merge_list:
			
 
				+#     col = col_m.replace('_m', '')  # 去掉 '_m' 得到相应的列名
			
 
				+#     if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
			
 
				+#         CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
			
 
				+
			
 
				+# CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h_test.csv")
			
 
				+
			
 
				+
			
 
				+import base64
			
 
				+
			
 
				+def audio_to_base64_string(file_path):
			
 
				+    # 打开语音文件并以二进制模式读取
			
 
				+    with open(file_path, "rb") as audio_file:
			
 
				+        audio_data = audio_file.read()
			
 
				+    
			
 
				+    # 对音频数据进行 Base64 编码
			
 
				+    encoded_audio = base64.b64encode(audio_data)
			
 
				+    
			
 
				+    # 将编码后的数据转换为字符串
			
 
				+    audio_string = encoded_audio.decode("utf-8")
			
 
				+    
			
 
				+    return audio_string
			
 
				+
			
 
				+# 示例调用
			
 
				+file_path = "bjh_rec_0a8ed71a266044c0b72f50c1d3d6be15.wav"  # 将此处替换为你的文件路径
			
 
				+audio_string = audio_to_base64_string(file_path)
			
 
				+
			
 
				+ # 将结果写入到文本文件中，以避免控制台显示乱码
			
 
				+with open("encoded_audio_string.txt", 'w') as output_file:
			
 
				+    output_file.write(audio_string)
			
--- a/time.md
+++ b/time.md
@@ -0,0 +1,69 @@
 
				+# 番茄时间

			
 
				+***

			
 
				+

			
 
				+5. 大模型面试50问

			
 
				+6. 自己动手学深度学习

			
 
				+

			
 
				+4. 介绍一下当前主流的大模型结构是如何组成的？

			
 
				+5. 介绍一下当前的GPT大模型的结构包括哪些部分？

			
 
				+6. 介绍下大模型的多头注意力机制

			
 
				+7. 什么是大模型MOE结构

			
 
				+8. 知识蒸馏的步骤是什么

			
 
				+9. 知识蒸馏中的教师模型和学生模型有什么区别

			
 
				+10. 什么是分组混合并行训练方法

			
 
				+11. 混合专家MOE的基本原理是什么

			
 
				+

			
 
				+

			
 
				+4. 写一篇关于大模型应用的论文

			
 
				+5. 看transformer代码

			
 
				+

			
 
				+

			
 
				+docker run -itd -p 17860:7860 -p 18022:22 -v /ai_home/zhaojingteng:/root/zhaojingteng --name=zjt_chat --gpus='"device=5,6,7"' nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04

			
 
				+

			
 
				+docker run -itd -p 8084:63000 -v /ai_home/zhongjiayi/gpu_server:/root/nlp_test --name=nlp_test

			
 
				+

			
 
				+docker run -itd -p 8063:22 -v /home/ubuntu/PaddleFaq/r_base:/root/r_base --security-opt seccomp：unconfined --name=r_base r-base-self:1.0

			
 
				+

			
 
				+

			
 
				+192.168.210.83

			
 
				+

			
 
				+255.255.255.0

			
 
				+# 备忘

			
 
				+---

			
 
				+飞天： 

			
 
				+111.198.66.100   shuju   Aa123Bb456

			
 
				+

			
 
				+101.200.228.213   LiYanpeng_DaoHang    thinkit123

			
 
				+

			
 
				+GPU

			
 
				+103.116.120.16  ubuntu  speech,123

			
 
				+

			
 
				+10.20.1.208 nlp speech,123

			
 
				+

			
 
				+kubectl get pods -n nlp

			
 
				+

			
 
				+kubectl exec -it nlp-3-86845b5f54-4t7ch -n nlp -- bash

			
 
				+

			
 
				+kubectl get svc -n nlp

			
 
				+

			
 
				+--查最高内存

			
 
				+ps auxw | head -1;ps auxw|sort -rn -k4|head -5

			
 
				+

			
 
				+https://agit.ai/ddx/TVBox/raw/branch/master/t4.json

			
 
				+

			
 
				+http://pandown.pro/tvbox/tvbox.json

			
 
				+

			
 
				+https://agit.ai/Yoursmile7/TVBox/raw/branch/master/live.txt

			
 
				+

			
 
				+

			
 
				+# 代理到本机的指定端口（公网服务器）

			
 
				+export https_proxy=http://10.126.126.3:12334;export http_proxy=http://10.126.126.3:12334;export all_proxy=socks5://10.126.126.3:12334

			
 
				+# 取消使用代理服务器

			
 
				+unset http_proxy;unset https_proxy;unset all_proxy

			
 
				+# 测试服务器是否可用

			
 
				+curl -v google.com

			
 
				+

			
 
				+[Service]

			
 
				+Environment="HTTP_PROXY=http://10.126.126.2:12334"

			
 
				+Environment="HTTPS_PROXY=http://10.126.126.2:12334"

			
 
				+Environment="NO_PROXY=localhost,127.0.0.1,.example.com"