|
@@ -0,0 +1,1078 @@
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+import pyreadstat
|
|
|
+from datetime import date
|
|
|
+from lunarcalendar import Converter, Lunar
|
|
|
+
|
|
|
+#统一列名
|
|
|
+def change_columns(df):
|
|
|
+ df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
|
|
|
+ "waist", "Systolic","Diastolic",
|
|
|
+
|
|
|
+ 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
|
|
|
+ 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
|
|
|
+
|
|
|
+ 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
|
|
|
+
|
|
|
+ 'Physical_activity',
|
|
|
+
|
|
|
+ 'Smoke','Drink',
|
|
|
+
|
|
|
+ "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
|
|
|
+ ]
|
|
|
+# 2020年把帕金森和记忆病症分开,需要和以前对齐
|
|
|
+def process_row(row):
|
|
|
+ da002_12_ = row['da003_12_']
|
|
|
+ da002_13_ = row['da003_13_']
|
|
|
+
|
|
|
+ if da002_12_ == 1 or da002_13_ == 1:
|
|
|
+ return 1
|
|
|
+ elif da002_12_ == 2 and da002_13_ == 2:
|
|
|
+ return 2
|
|
|
+ elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
|
|
|
+ return 2
|
|
|
+ elif pd.isna(da002_12_) and pd.isna(da002_13_):
|
|
|
+ return np.nan
|
|
|
+ else:
|
|
|
+ return np.nan # 预防万一,其余情况下设为NA
|
|
|
+
|
|
|
+def update_da051(value):
|
|
|
+ if value == 1:
|
|
|
+ return 3
|
|
|
+ elif value == 3:
|
|
|
+ return 1
|
|
|
+ else:
|
|
|
+ return value
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ # 2011年
|
|
|
+ year = "2011"
|
|
|
+ demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
|
|
|
+ psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
|
|
|
+ biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
|
|
|
+ blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
|
|
|
+ health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
|
|
|
+ health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
|
|
|
+ exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
|
|
|
+ #性别#年龄#居住地#婚姻状况
|
|
|
+ # 1 married or partnered
|
|
|
+ # 0 other marital status (separated, divorced, unmarried, or widowed)
|
|
|
+ demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
|
|
|
+
|
|
|
+ #教育
|
|
|
+ # 0 not finish primary school/No formal education illiterate
|
|
|
+ # 1 finish primary school
|
|
|
+ demo["education"] = demo["bd001"].apply(lambda x : 1 if x in [3,4,5,6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
+
|
|
|
+ data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
|
|
|
+
|
|
|
+ #居住地
|
|
|
+ data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
|
+
|
|
|
+ #身高#体重#收缩压#舒张压
|
|
|
+ biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
|
|
|
+ biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ #腰围
|
|
|
+ biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
|
|
|
+ #血压测量后两次的平均
|
|
|
+ biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
|
|
|
+ biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
|
|
|
+ biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
|
|
|
+ biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
|
|
|
+ biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic']]
|
|
|
+ data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
|
|
|
+
|
|
|
+ #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
|
|
|
+ #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
|
|
|
+ blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
|
|
|
+ data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
|
|
|
+ # 慢性病:
|
|
|
+ # (1) Hypertension 高血压病
|
|
|
+ # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
|
|
|
+ # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
|
|
|
+ # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
|
|
|
+ # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
|
|
|
+ # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
|
|
|
+ # (除脂肪肝、肿瘤或癌外)
|
|
|
+ # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
|
|
|
+ # (8) Stroke 中风
|
|
|
+ # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
|
|
|
+ # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
|
|
|
+ # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
|
|
|
+ # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
|
|
|
+ # (13) Arthritis or rheumatism 关节炎或风湿病
|
|
|
+ # (14) Asthma 哮喘
|
|
|
+
|
|
|
+ # 体力活动
|
|
|
+ # 2 vigorous (vigorous activity more than once a week)
|
|
|
+ # 1 moderate (moderate activity more than once a week)
|
|
|
+ # 0 inactive (the rest)
|
|
|
+ health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
|
|
|
+ 1 if x["da051_2_"]==1 else
|
|
|
+ 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
|
|
|
+ else np.nan ,axis=1)
|
|
|
+ # 抽烟
|
|
|
+ # 1 抽过烟
|
|
|
+ # 0 没有抽过烟
|
|
|
+ health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
|
|
|
+
|
|
|
+ # 喝酒
|
|
|
+ # 1 喝过酒
|
|
|
+ # 0 没有喝过酒
|
|
|
+ health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
|
|
|
+ 0 if x["da069"] == 1 else
|
|
|
+ 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
|
|
|
+
|
|
|
+ health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
|
|
|
+ ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
|
|
|
+ ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"]]
|
|
|
+
|
|
|
+ data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
|
|
|
+
|
|
|
+
|
|
|
+ #计算认知功能得分,分成三部分:电话问卷9分,词语回忆20分、画图1分
|
|
|
+ health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
|
|
|
+
|
|
|
+ #词语记忆
|
|
|
+ health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ #画图
|
|
|
+ health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+
|
|
|
+ data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
|
|
|
+ health_status["dc001s3_score"] + health_status["dc002_score"]+ \
|
|
|
+ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
|
|
|
+ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
|
|
|
+ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
|
|
|
+ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
|
|
|
+ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
|
|
|
+ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
|
|
|
+ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
|
|
|
+ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
|
|
|
+ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
|
|
|
+ health_status["draw_score"]
|
|
|
+ #心理得分
|
|
|
+ health_status["dc009_score"] = health_status["dc009"]-1
|
|
|
+ health_status["dc010_score"] = health_status["dc010"]-1
|
|
|
+ health_status["dc011_score"] = health_status["dc011"]-1
|
|
|
+ health_status["dc012_score"] = health_status["dc012"]-1
|
|
|
+ health_status["dc013_score"] = 4 - health_status["dc013"]
|
|
|
+ health_status["dc014_score"] = health_status["dc014"]-1
|
|
|
+ health_status["dc015_score"] = health_status["dc015"]-1
|
|
|
+ health_status["dc016_score"] = 4 - health_status["dc016"]
|
|
|
+ health_status["dc017_score"] = health_status["dc017"]-1
|
|
|
+ health_status["dc018_score"] = health_status["dc018"]-1
|
|
|
+ data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
|
|
|
+ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
|
|
|
+ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
|
|
|
+ #睡眠状态
|
|
|
+ # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
|
|
|
+ # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
|
|
|
+ # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
+ # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
+ data_2011["sleep_state"] = health_status['dc015']
|
|
|
+
|
|
|
+ #ADL
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
+ data_2011["wave"] = year
|
|
|
+ change_columns(data_2011)
|
|
|
+ # 2011年的ID和其他年份有一点区别,倒数第三位加0
|
|
|
+ data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
|
|
|
+
|
|
|
+ # 2013年
|
|
|
+ year = "2013"
|
|
|
+ demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
|
|
|
+ psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
|
|
|
+ biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
|
|
|
+ health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
+ health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
|
|
|
+ exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
|
|
|
+
|
|
|
+ #性别#年龄#婚姻状况
|
|
|
+ # 1 married or partnered
|
|
|
+ # 0 other marital status (separated, divorced, unmarried, or widowed)
|
|
|
+ demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
|
|
|
+
|
|
|
+ #教育
|
|
|
+ # 0 not finish primary school/No formal education illiterate
|
|
|
+ # 1 finish primary school
|
|
|
+
|
|
|
+ # 纠正2011年统计错误的教育
|
|
|
+ demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
|
|
|
+ demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x in [3,4,5, 6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
|
|
|
+ education_correct = demo[['ID',"education_correct"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
|
|
|
+ # 删除多余的列
|
|
|
+ data_2011 = data_2011.drop(columns=['education_correct'])
|
|
|
+
|
|
|
+ #更新2013的教育
|
|
|
+ demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
|
|
|
+ demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
|
|
|
+ #合并2011年的教育
|
|
|
+ eductaion_2011 = data_2011[['ID',"education"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
|
|
|
+
|
|
|
+ # 纠正2011年统计错误的出生年
|
|
|
+ demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
|
|
|
+ birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
|
|
|
+ data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
|
|
|
+ # 删除多余的列
|
|
|
+ data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
|
|
|
+ #合并2011年的出生年
|
|
|
+ birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
|
|
|
+ demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
+
|
|
|
+ data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
|
|
|
+
|
|
|
+ #居住地
|
|
|
+ data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
|
+
|
|
|
+ #身高#体重#收缩压#舒张压
|
|
|
+ biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
|
|
|
+ biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ #腰围
|
|
|
+ biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
|
|
|
+ #血压测量后两次的平均
|
|
|
+ biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
|
|
|
+ biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
|
|
|
+ biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
|
|
|
+ biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
|
|
|
+ biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic']]
|
|
|
+ data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
|
|
|
+
|
|
|
+ #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
|
|
|
+ #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
|
|
|
+ data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
|
|
|
+
|
|
|
+ # 慢性病:
|
|
|
+ # (1) Hypertension 高血压病
|
|
|
+ # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
|
|
|
+ # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
|
|
|
+ # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
|
|
|
+ # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
|
|
|
+ # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
|
|
|
+ # (除脂肪肝、肿瘤或癌外)
|
|
|
+ # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
|
|
|
+ # (8) Stroke 中风
|
|
|
+ # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
|
|
|
+ # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
|
|
|
+ # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
|
|
|
+ # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
|
|
|
+ # (13) Arthritis or rheumatism 关节炎或风湿病
|
|
|
+ # (14) Asthma 哮喘
|
|
|
+
|
|
|
+
|
|
|
+ # 体力活动
|
|
|
+ # 2 vigorous (vigorous activity more than once a week)
|
|
|
+ # 1 moderate (moderate activity more than once a week)
|
|
|
+ # 0 inactive (the rest)
|
|
|
+ health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
|
|
|
+ 1 if x["da051_2_"]==1 else
|
|
|
+ 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
|
|
|
+ else np.nan ,axis=1)
|
|
|
+
|
|
|
+ # 抽烟
|
|
|
+ # 1 抽过烟
|
|
|
+ # 0 没有抽过烟
|
|
|
+ health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
|
|
|
+
|
|
|
+ # 喝酒
|
|
|
+ # 1 喝过酒
|
|
|
+ # 0 没有喝过酒
|
|
|
+ health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
|
|
|
+ 0 if x["da069"] == 1 else
|
|
|
+ 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
|
|
|
+
|
|
|
+ # 合并2011年的慢性病
|
|
|
+ columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
|
|
|
+ ,'da007_12_','da007_13_','da007_14_']
|
|
|
+ columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
|
|
|
+ for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
|
|
|
+ health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
|
|
|
+
|
|
|
+ diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
|
|
|
+
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ for col in columns_to_diseases_new:
|
|
|
+ health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
|
|
|
+
|
|
|
+ health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
|
|
|
+
|
|
|
+ data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
|
|
|
+
|
|
|
+ #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
|
|
|
+ health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
|
|
|
+
|
|
|
+ #词语记忆
|
|
|
+ health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ #画图
|
|
|
+ health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+
|
|
|
+ data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
|
|
|
+ health_status["dc001s3_score"] + health_status["dc002_score"]+ \
|
|
|
+ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
|
|
|
+ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
|
|
|
+ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
|
|
|
+ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
|
|
|
+ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
|
|
|
+ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
|
|
|
+ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
|
|
|
+ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
|
|
|
+ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
|
|
|
+ health_status["draw_score"]
|
|
|
+ #心理得分
|
|
|
+ health_status["dc009_score"] = health_status["dc009"]-1
|
|
|
+ health_status["dc010_score"] = health_status["dc010"]-1
|
|
|
+ health_status["dc011_score"] = health_status["dc011"]-1
|
|
|
+ health_status["dc012_score"] = health_status["dc012"]-1
|
|
|
+ health_status["dc013_score"] = 4 - health_status["dc013"]
|
|
|
+ health_status["dc014_score"] = health_status["dc014"]-1
|
|
|
+ health_status["dc015_score"] = health_status["dc015"]-1
|
|
|
+ health_status["dc016_score"] = 4 - health_status["dc016"]
|
|
|
+ health_status["dc017_score"] = health_status["dc017"]-1
|
|
|
+ health_status["dc018_score"] = health_status["dc018"]-1
|
|
|
+ data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
|
|
|
+ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
|
|
|
+ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
|
|
|
+
|
|
|
+ #睡眠状态
|
|
|
+ # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
|
|
|
+ # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
|
|
|
+ # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
+ # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
+ data_2013["sleep_state"] = health_status['dc015']
|
|
|
+
|
|
|
+ #ADL
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
+ data_2013["wave"] = year
|
|
|
+ change_columns(data_2013)
|
|
|
+ data_2013 = pd.concat([data_2011, data_2013], axis=0)
|
|
|
+
|
|
|
+ # 2015年
|
|
|
+ year = "2015"
|
|
|
+ demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
|
|
|
+ psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
|
|
|
+ blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
|
|
|
+ biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
|
|
|
+ health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
+ health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
|
|
|
+
|
|
|
+ #性别#年龄#婚姻状况
|
|
|
+ # 1 married or partnered
|
|
|
+ # 0 other marital status (separated, divorced, unmarried, or widowed)
|
|
|
+ demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
|
|
|
+
|
|
|
+ #教育
|
|
|
+ # 0 not finish primary school/No formal education illiterate
|
|
|
+ # 1 finish primary school
|
|
|
+ #更新2015的教育
|
|
|
+ demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
|
|
|
+ demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
|
|
|
+ #合并2013年的教育
|
|
|
+ eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
|
|
|
+
|
|
|
+ # 2015年的出生年
|
|
|
+ demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
+
|
|
|
+ data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
|
|
|
+
|
|
|
+ #居住地
|
|
|
+ data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
|
+
|
|
|
+ #身高#体重#收缩压#舒张压
|
|
|
+ biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
|
|
|
+ biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ #腰围
|
|
|
+ biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
|
|
|
+ #血压测量后两次的平均
|
|
|
+ biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
|
|
|
+ biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
|
|
|
+ biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
|
|
|
+ biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
|
|
|
+ biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
|
|
|
+ #身高#体重#收缩压#舒张压
|
|
|
+ biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic']]
|
|
|
+ data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
|
|
|
+
|
|
|
+ #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
|
|
|
+ #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
|
|
|
+ blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
|
|
|
+ data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
|
|
|
+
|
|
|
+ # 慢性病:
|
|
|
+ # (1) Hypertension 高血压病
|
|
|
+ # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
|
|
|
+ # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
|
|
|
+ # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
|
|
|
+ # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
|
|
|
+ # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
|
|
|
+ # (除脂肪肝、肿瘤或癌外)
|
|
|
+ # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
|
|
|
+ # (8) Stroke 中风
|
|
|
+ # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
|
|
|
+ # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
|
|
|
+ # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
|
|
|
+ # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
|
|
|
+ # (13) Arthritis or rheumatism 关节炎或风湿病
|
|
|
+ # (14) Asthma 哮喘
|
|
|
+
|
|
|
+ # 体力活动
|
|
|
+ # 2 vigorous (vigorous activity more than once a week)
|
|
|
+ # 1 moderate (moderate activity more than once a week)
|
|
|
+ # 0 inactive (the rest)
|
|
|
+ health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
|
|
|
+ 1 if x["da051_2_"]==1 else
|
|
|
+ 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
|
|
|
+ else np.nan ,axis=1)
|
|
|
+
|
|
|
+ # 抽烟
|
|
|
+ # 1 抽过烟
|
|
|
+ # 0 没有抽过烟
|
|
|
+ health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
|
|
|
+
|
|
|
+ # 喝酒
|
|
|
+ # 1 喝过酒
|
|
|
+ # 0 没有喝过酒
|
|
|
+ health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
|
|
|
+ 0 if x["da069"] == 1 else
|
|
|
+ 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
|
|
|
+
|
|
|
+ # 合并2013年的慢性病
|
|
|
+ columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
|
|
|
+ ,'da007_12_','da007_13_','da007_14_']
|
|
|
+ columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
|
|
|
+ for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
|
|
|
+ health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
|
|
|
+
|
|
|
+ diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
|
|
|
+
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ for col in columns_to_diseases_new:
|
|
|
+ health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
|
|
|
+
|
|
|
+ health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
|
|
|
+
|
|
|
+ data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
|
|
|
+
|
|
|
+ #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
|
|
|
+ health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
|
|
|
+
|
|
|
+ #词语记忆
|
|
|
+ health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
|
|
|
+ health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
|
|
|
+ # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ #画图
|
|
|
+ health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+
|
|
|
+ data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
|
|
|
+ health_status["dc001s3_score"] + health_status["dc002_score"]+ \
|
|
|
+ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
|
|
|
+ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
|
|
|
+ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
|
|
|
+ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
|
|
|
+ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
|
|
|
+ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
|
|
|
+ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
|
|
|
+ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
|
|
|
+ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
|
|
|
+ health_status["draw_score"]
|
|
|
+ #心理得分
|
|
|
+ health_status["dc009_score"] = health_status["dc009"]-1
|
|
|
+ health_status["dc010_score"] = health_status["dc010"]-1
|
|
|
+ health_status["dc011_score"] = health_status["dc011"]-1
|
|
|
+ health_status["dc012_score"] = health_status["dc012"]-1
|
|
|
+ health_status["dc013_score"] = 4 - health_status["dc013"]
|
|
|
+ health_status["dc014_score"] = health_status["dc014"]-1
|
|
|
+ health_status["dc015_score"] = health_status["dc015"]-1
|
|
|
+ health_status["dc016_score"] = 4 - health_status["dc016"]
|
|
|
+ health_status["dc017_score"] = health_status["dc017"]-1
|
|
|
+ health_status["dc018_score"] = health_status["dc018"]-1
|
|
|
+ data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
|
|
|
+ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
|
|
|
+ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
|
|
|
+ #睡眠状态
|
|
|
+ # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
|
|
|
+ # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
|
|
|
+ # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
+ # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
+ data_2015["sleep_state"] = health_status['dc015']
|
|
|
+
|
|
|
+ #ADL
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
+ data_2015["wave"] = year
|
|
|
+ change_columns(data_2015)
|
|
|
+ data_2015 = pd.concat([data_2013, data_2015], axis=0)
|
|
|
+
|
|
|
+ # 2018年
|
|
|
+ year = "2018"
|
|
|
+ demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
|
|
|
+ psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
|
|
|
+ health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
+ health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
|
|
|
+ cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
|
|
|
+
|
|
|
+ #性别#年龄#婚姻状况
|
|
|
+ # 1 married or partnered
|
|
|
+ # 0 other marital status (separated, divorced, unmarried, or widowed)
|
|
|
+ demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
|
|
|
+
|
|
|
+ #教育
|
|
|
+ # 0 not finish primary school/No formal education illiterate
|
|
|
+ # 1 finish primary school
|
|
|
+ #更新2015的教育
|
|
|
+ demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
|
|
|
+ demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
|
|
|
+
|
|
|
+ # 出生年
|
|
|
+ demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
+
|
|
|
+ data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
|
|
|
+
|
|
|
+ #居住地
|
|
|
+ data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
|
+
|
|
|
+ #身高#体重#收缩压#舒张压
|
|
|
+ data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
|
|
|
+
|
|
|
+ #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
|
|
|
+ #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
|
|
|
+ data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
|
|
|
+
|
|
|
+ # 慢性病:
|
|
|
+ # (1) Hypertension 高血压病
|
|
|
+ # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
|
|
|
+ # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
|
|
|
+ # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
|
|
|
+ # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
|
|
|
+ # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
|
|
|
+ # (除脂肪肝、肿瘤或癌外)
|
|
|
+ # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
|
|
|
+ # (8) Stroke 中风
|
|
|
+ # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
|
|
|
+ # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
|
|
|
+ # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
|
|
|
+ # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
|
|
|
+ # (13) Arthritis or rheumatism 关节炎或风湿病
|
|
|
+ # (14) Asthma 哮喘
|
|
|
+
|
|
|
+ # 体力活动
|
|
|
+ # 2 vigorous (vigorous activity more than once a week)
|
|
|
+ # 1 moderate (moderate activity more than once a week)
|
|
|
+ # 0 inactive (the rest)
|
|
|
+ health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
|
|
|
+ 1 if x["da051_2_"]==1 else
|
|
|
+ 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
|
|
|
+ else np.nan ,axis=1)
|
|
|
+
|
|
|
+ # 抽烟
|
|
|
+ # 1 抽过烟
|
|
|
+ # 0 没有抽过烟
|
|
|
+ health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
|
|
|
+
|
|
|
+ # 喝酒
|
|
|
+ # 1 喝过酒
|
|
|
+ # 0 没有喝过酒
|
|
|
+ health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
|
|
|
+ 0 if x["da069"] == 1 else
|
|
|
+ 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
|
|
|
+
|
|
|
+ columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
|
|
|
+ ,'da007_12_','da007_13_','da007_14_']
|
|
|
+ columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
|
|
|
+ for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
|
|
|
+ health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
|
|
|
+
|
|
|
+ diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
|
|
|
+
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ for col in columns_to_diseases_new:
|
|
|
+ health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
|
|
|
+
|
|
|
+ health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
|
|
|
+
|
|
|
+ data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
|
|
|
+
|
|
|
+ #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
|
|
|
+ cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
|
|
|
+ cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
|
|
|
+ cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
|
|
|
+ cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
|
|
|
+ # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
|
|
|
+ cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1)
|
|
|
+ cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1)
|
|
|
+ cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
|
|
|
+ cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
|
|
|
+ cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
|
|
|
+
|
|
|
+ #词语记忆
|
|
|
+ cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
|
|
|
+ cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
|
|
|
+ cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
|
|
|
+ cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1)
|
|
|
+ cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1)
|
|
|
+ cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)
|
|
|
+ cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1)
|
|
|
+ cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1)
|
|
|
+ cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)
|
|
|
+ cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)
|
|
|
+ # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1)
|
|
|
+ cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1)
|
|
|
+ cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1)
|
|
|
+ cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1)
|
|
|
+ cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1)
|
|
|
+ cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)
|
|
|
+ cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1)
|
|
|
+ cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1)
|
|
|
+ cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)
|
|
|
+ cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)
|
|
|
+ # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
|
|
|
+ #画图
|
|
|
+ cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
|
|
|
+
|
|
|
+ data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
|
|
|
+ cognition["dc001s3_score"] + cognition["dc002_score"]+ \
|
|
|
+ cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
|
|
|
+ cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
|
|
|
+ cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
|
|
|
+ cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
|
|
|
+ cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
|
|
|
+ cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
|
|
|
+ cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
|
|
|
+ cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
|
|
|
+ cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
|
|
|
+ cognition["draw_score"]
|
|
|
+ #心理得分
|
|
|
+ cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
|
|
|
+ cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
|
|
|
+ cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
|
|
|
+ #睡眠状态
|
|
|
+ # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
|
|
|
+ # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
|
|
|
+ # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
+ # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
+ data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x)
|
|
|
+
|
|
|
+ #ADL
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
+ data_2018["wave"] = year
|
|
|
+ change_columns(data_2018)
|
|
|
+ data_2018 = pd.concat([data_2015, data_2018], axis=0)
|
|
|
+
|
|
|
+ # 2020年
|
|
|
+ year = "2020"
|
|
|
+ demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
|
|
|
+ psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
|
|
|
+ health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
|
|
|
+
|
|
|
+ #性别#年龄#婚姻状况
|
|
|
+ # 1 married or partnered
|
|
|
+ # 0 other marital status (separated, divorced, unmarried, or widowed)
|
|
|
+ demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
|
|
|
+
|
|
|
+ #教育
|
|
|
+ # 0 not finish primary school/No formal education illiterate
|
|
|
+ # 1 finish primary school
|
|
|
+ demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
|
|
|
+ demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
|
|
|
+ #合并2018年的教育
|
|
|
+ eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
|
|
|
+
|
|
|
+ # 出生年
|
|
|
+ demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
|
|
|
+ #合并2018年的出生年
|
|
|
+ birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
|
|
|
+ demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
+
|
|
|
+ demo["ba003"] = 1
|
|
|
+
|
|
|
+ data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
|
|
|
+ #居住地
|
|
|
+ data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
|
+
|
|
|
+ #身高#体重#收缩压#舒张压
|
|
|
+ data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
|
|
|
+
|
|
|
+ #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
|
|
|
+ #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
|
|
|
+ data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
|
|
|
+
|
|
|
+ # 慢性病:
|
|
|
+ # (1) Hypertension 高血压病
|
|
|
+ # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
|
|
|
+ # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
|
|
|
+ # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
|
|
|
+ # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
|
|
|
+ # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
|
|
|
+ # (除脂肪肝、肿瘤或癌外)
|
|
|
+ # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
|
|
|
+ # (8) Stroke 中风
|
|
|
+ # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
|
|
|
+ # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
|
|
|
+ # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
|
|
|
+ # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
|
|
|
+ # (13) Arthritis or rheumatism 关节炎或风湿病
|
|
|
+ # (14) Asthma 哮喘
|
|
|
+ # 2020年把帕金森和记忆病症分开,需要和以前对齐
|
|
|
+
|
|
|
+ # 体力活动
|
|
|
+ # 2 vigorous (vigorous activity more than once a week)
|
|
|
+ # 1 moderate (moderate activity more than once a week)
|
|
|
+ # 0 inactive (the rest)
|
|
|
+ health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else
|
|
|
+ 1 if x["da032_2_"]==1 else
|
|
|
+ 0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2)
|
|
|
+ else np.nan ,axis=1)
|
|
|
+
|
|
|
+ # 抽烟
|
|
|
+ # 1 抽过烟
|
|
|
+ # 0 没有抽过烟
|
|
|
+ health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
|
|
|
+
|
|
|
+ # 喝酒
|
|
|
+ # 1 喝过酒
|
|
|
+ # 0 没有喝过酒
|
|
|
+ health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else
|
|
|
+ 0 if x["da051"] == 3 else np.nan, axis=1)
|
|
|
+
|
|
|
+ health_status['da003_12_'] = health_status.apply(process_row, axis=1)
|
|
|
+
|
|
|
+ columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
|
|
|
+ ,'da003_12_','da003_14_','da003_15_']
|
|
|
+ columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
|
|
|
+ for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
|
|
|
+ health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
|
|
|
+
|
|
|
+ diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
|
|
|
+
|
|
|
+ # 按 'ID' 列合并两个表
|
|
|
+ health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
|
|
|
+ # 使用 fillna() 来更新字段
|
|
|
+ for col in columns_to_diseases_new:
|
|
|
+ health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
|
|
|
+
|
|
|
+ health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
|
|
|
+ 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
|
|
|
+ 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
|
|
|
+
|
|
|
+ data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
|
|
|
+
|
|
|
+ #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
|
|
|
+ health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+ health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+ health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+ health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+ health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+ health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1)
|
|
|
+ health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1)
|
|
|
+ health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
|
|
|
+ health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
|
|
|
+ health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
|
|
|
+
|
|
|
+ #词语记忆
|
|
|
+ health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
|
|
|
+ health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
|
|
|
+ health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
|
|
|
+ health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1)
|
|
|
+ health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1)
|
|
|
+ health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)
|
|
|
+ health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1)
|
|
|
+ health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1)
|
|
|
+ health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)
|
|
|
+ health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)
|
|
|
+ health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1)
|
|
|
+ health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1)
|
|
|
+ health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1)
|
|
|
+ health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1)
|
|
|
+ health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1)
|
|
|
+ health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)
|
|
|
+ health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1)
|
|
|
+ health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1)
|
|
|
+ health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)
|
|
|
+ health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)
|
|
|
+ #画图
|
|
|
+ health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
|
|
|
+
|
|
|
+ data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
|
|
|
+ health_status["dc001s3_score"] + health_status["dc002_score"]+ \
|
|
|
+ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
|
|
|
+ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
|
|
|
+ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
|
|
|
+ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
|
|
|
+ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
|
|
|
+ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
|
|
|
+ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
|
|
|
+ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
|
|
|
+ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
|
|
|
+ health_status["draw_score"]
|
|
|
+ #心理得分
|
|
|
+ health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
|
|
|
+ data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
|
|
|
+ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
|
|
|
+ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
|
|
|
+
|
|
|
+ #睡眠状态
|
|
|
+ # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
|
|
|
+ # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
|
|
|
+ # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
+ # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
+ data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x)
|
|
|
+
|
|
|
+ #ADL
|
|
|
+ health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
+ data_2020["wave"] = year
|
|
|
+ change_columns(data_2020)
|
|
|
+ data_2020 = pd.concat([data_2018, data_2020], axis=0)
|
|
|
+
|
|
|
+ #修改地区名称
|
|
|
+ #省份、城市名称和污染物数据格式对齐
|
|
|
+ #海东地区->海东市
|
|
|
+ data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
|
|
|
+ #北京 -> 北京市
|
|
|
+ data_2020['city'] = data_2020['city'].replace('北京', '北京市')
|
|
|
+ data_2020['province'] = data_2020['province'].replace('北京', '北京市')
|
|
|
+ #哈尔滨 -> 哈尔滨市
|
|
|
+ data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
|
|
|
+ #天津 -> 天津市
|
|
|
+ data_2020['city'] = data_2020['city'].replace('天津', '天津市')
|
|
|
+ data_2020['province'] = data_2020['province'].replace('天津', '天津市')
|
|
|
+ #广西省 -> 广西壮族自治区
|
|
|
+ data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
|
|
|
+ #巢湖市 -> 合肥市
|
|
|
+ data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
|
|
|
+ #襄樊市->襄阳市
|
|
|
+ data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
|
|
|
+
|
|
|
+ data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
|
|
|
+ print(123)
|