import pandas as pd import numpy as np import pyreadstat from datetime import date from lunarcalendar import Converter, Lunar #统一列名 def change_columns(df): df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"urban_nbs","Height", "Weight", "waist", "Systolic","Diastolic", "Sit_Stand_5x", "Walking_Speed_Time", 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp', 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', 'Physical_activity', 'Smoke','Drink', "Accident_Or_Injury","Fell_In_Last2Years", "Wear_Glasses" , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min" , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days" , "Interacted_With_Friends", "Played_Ma_jong", "Provided_help", "Sport", "Community_Related_Organization", "Charity_work", "Training_course", "Other", "None" , "Internet_Usage_LastMonth", "Drink_PastYear", "Cognition_score", "Psychiatric_score","sleep_state", "ADL", 'Gas_Connection','Heating_Facility', 'Heating_Energy', 'Cooking_Fuel', "wave", ] # 2020年把帕金森和记忆病症分开,需要和以前对齐 def process_row(row): da002_12_ = row['da003_12_'] da002_13_ = row['da003_13_'] if da002_12_ == 1 or da002_13_ == 1: return 1 elif da002_12_ == 2 and da002_13_ == 2: return 2 elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2): return 2 elif pd.isna(da002_12_) and pd.isna(da002_13_): return np.nan else: return np.nan # 预防万一,其余情况下设为NA def update_da051(value): if value == 1: return 3 elif value == 3: return 1 else: return value if __name__ == "__main__": # 2011年 year = "2011" demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta") psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk') biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta") blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta") health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta") health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta") exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta") weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta") houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/housing_characteristics.dta") #性别#年龄#居住地#婚姻状况 # 1 married or partnered # 0 other marital status (separated, divorced, unmarried, or widowed) demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1) #教育 # 0 below high school # 1 high school # 2 college or above demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan) #获取随访时间 demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left") data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']] #居住地 # 0 农村 # 1 城市 data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left") #身高#体重#收缩压#舒张压 biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) #腰围 biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) #血压测量后两次的平均 biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2 biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2 #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗 # 1 yes # 0 no biomarkers["Sit_Stand_5x"] = biomarkers["qh002"].apply(lambda x : 1 if x == 1 else 0 if x == 5 else np.nan) # 步行速度时间 biomarkers["Walking_Speed_Time"] = (biomarkers["qg002"] + biomarkers["qg003"]) /2 biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic', "Sit_Stand_5x", "Walking_Speed_Time"]] data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left") #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])] data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left") # 慢性病: # (1) Hypertension 高血压病 # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降) # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高) # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌) # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌) # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病 # (除脂肪肝、肿瘤或癌外) # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病) # (8) Stroke 中风 # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌) # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌) # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题 # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症) # (13) Arthritis or rheumatism 关节炎或风湿病 # (14) Asthma 哮喘 # 体力活动 # 2 vigorous (vigorous activity more than once a week) # 1 moderate (moderate activity more than once a week) # 0 inactive (the rest) health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 1 if x["da051_2_"]==1 else 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) else np.nan ,axis=1) # 抽烟 # 1 抽过烟 # 0 没有抽过烟 health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 喝酒 # 1 喝过酒 # 0 没有喝过酒 health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 0 if x["da069"] == 1 else 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1) # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗? # 1 是 # 0 否 health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 过去两年有没有摔倒? # 1 是 # 0 否 health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # # 您什么时候开始来月经的?(year/age) # health_status['Menarche_Year']=health_status["da026_1"] # health_status['Menarche_Age']=health_status["da026_2"] # # 您什么时候开始绝经的? # health_status['Menopause_Year']=health_status["da028_1"] # health_status['Menopause_Age']=health_status["da028_2"] # # 第一次诊断出您有前列腺疾病是在什么时候? # health_status['Prostate_Issue_Year']=health_status["da030_1"] # health_status['Prostate_Issue_Age']=health_status["da030_2"] # 是否戴眼镜(包括矫正视力镜片)? # 1 是 # 0 否 # 2 失明 # 3 偶尔 health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else np.nan) # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间) health_status['Average_Sleep_Hours']=health_status["da049"] # 过去一个月内,您通常午睡多长时间?分钟 health_status['Average_Nap_Minutes']=health_status["da050"] # 您通常每周有没有至少持续做激烈活动十分钟? health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续做中等强度的体力活动十分钟? health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续走路十分钟? health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有多少天做[激烈活动]至少十分钟? health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1) # 您通常每周有多少天做[中等强度的体力活动]至少十分钟? health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1) # 您通常每周有多少天做[走路]至少十分钟? health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1) # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时 # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时 # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时 # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟 # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟 # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟 # health_status['Walking_30Min_PerDay']=health_status["da054_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时 # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时 # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时 # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"] # 活动的原因 # 1 工作需要 # 2 娱乐 # 3 体育锻炼 # 4 其他 # health_status[["Reason_For_Vigorous_Activity"]]= np.nan # health_status[["Reason_For_Moderate_Effort"]]= np.nan # health_status[["Reason_For_Walking"]]= np.nan # 过去一个月是否进行了下列社交活动? # (1) 串门、跟朋友交往 # (2) 打麻将、下棋、打牌、去社区活动室 # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助 # (4) 去公园或者其他场所跳舞、健身、练气功等 # (5) 参加社团组织活动 # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人 # (7) 上学或者参加培训课程 # (8)其他 # (9) 以上均没有 health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1) health_status["da056s2"] = health_status.apply(lambda x: 1 if x["da056s2"]==2 else 0, axis=1) health_status["da056s3"] = health_status.apply(lambda x: 1 if x["da056s3"]==3 else 0, axis=1) health_status["da056s4"] = health_status.apply(lambda x: 1 if x["da056s4"]==4 else 0, axis=1) health_status["da056s5"] = health_status.apply(lambda x: 1 if x["da056s5"]==5 else 0, axis=1) health_status["da056s6"] = health_status.apply(lambda x: 1 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1) health_status["da056s7"] = health_status.apply(lambda x: 1 if x["da056s8"]==8 else 0, axis=1) health_status["da056s8"] = health_status.apply(lambda x: 1 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1) health_status["da056s9"] = health_status.apply(lambda x: 1 if x["da056s12"]==12 else 0, axis=1) # 过去一个月的活动频率 # (1) Almost daily 差不多每天 # (2) Almost every week 差不多每周 # (3) Not regularly 不经常 # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1) # health_status["da057_7_"] = health_status["da057_8_"] # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1 # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2 # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3 # else np.nan, axis=1) # 过去一个月,您是否上网? # 1 是 # 0 否 health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0) # # 使用以下哪些工具上网? # health_status[["Internet_Tools_Desktop_computer"]] = np.nan # health_status[["Internet_Tools_Laptop_computer"]] = np.nan # health_status[["Internet_Tools_Tablet_computer"]] = np.nan # health_status[["Internet_Tools_Cellphone"]] = np.nan # health_status[["Internet_Tools_Other"]] = np.nan # # 上网一般做什么? # health_status[["Internet_Purpose_Chat"]] = np.nan # health_status[["Internet_Purpose_news"]] = np.nan # health_status[["Internet_Purpose_videos"]] = np.nan # health_status[["Internet_Purpose_games"]] = np.nan # health_status[["Internet_Purpose_Financial"]] = np.nan # health_status[["Internet_Purpose_Others"]] = np.nan # # 是否会用手机支付 # health_status[["Mobile_Payment"]] = np.nan # # 是否使用微信? # health_status[["Wechat_Usage"]] = np.nan # # 发不发微信朋友圈? # health_status[["Post_Moments"]] = np.nan # # 现在还在吸烟还是戒烟了? # # 1 仍然抽烟 Skip DA062 请跳过DA062 # # 2 戒烟 # health_status['Current_Smoking_Status']=health_status["da061"] # 吸烟时,一般抽什么烟? # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟) # (2) Smoking self-rolled cigarettes 自己卷烟抽 # (3) Filtered cigarette带滤咀香烟 # (4) Unfiltered cigarette不带滤咀香烟 # (5) Cigar雪茄 # (6) Water cigarettes 水烟 # health_status.loc[health_status['da060'] == 1, 'Smoking_Type_pipe'] = 1 # health_status.loc[health_status['da060'] == 2, 'Smoking_Type_rolled'] = 2 # health_status.loc[health_status['da060'] == 3, 'Smoking_Type_Filtered'] = 3 # health_status.loc[health_status['da060'] == 4, 'Smoking_Type_Unfiltered'] = 4 # health_status.loc[health_status['da060'] == 5, 'Smoking_Type_Cigar'] = 5 # health_status.loc[health_status['da060'] == 6, 'Smoking_Type_Water'] = 6 # 现在/戒烟前平均一天抽多少支香烟? # health_status['Daily_Cigarette_Count']=health_status["da063"] # 在过去的一年, 喝酒吗 # (1) Drink more than once a month. 喝酒,每月超过一次 # (2) Drink but less than once a month 喝酒,但每月少于一次 # (3) None of these 什么都不喝 health_status['Drink_PastYear']=health_status["da067"] # 过去一年内 平均一个月喝几次酒 # (1)Once a month 每月一次 # (2)2-3 times a month 每月2-3次 # (3)Once a week 每周一次 # (4)2-3 times a week 每周2-3次 # (5)4-6 times a week 每周4-6次 # (6)Once a day 每天一次 # (7)Twice a day 一天两次 # (8)More than twice a day 一天超过两次 # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1) health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_' ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_' ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink" , "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min" , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days" , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9" , "Internet_Usage_LastMonth", "Drink_PastYear"]] data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left") # 自上次访问以来的两年内,您是否发作过心脏病? # 1 是 # 0 否 data_2011[['Heart_attack_2_years']]=np.nan # 自上次访问以来,是否有医生诊断您中风复发? # 1 是 # 0 否 data_2011[['Recurrent_Stroke']]=np.nan #计算认知功能得分,分成三部分:电话问卷9分,词语回忆20分、画图1分 health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0) health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0) health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0) #词语记忆 health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0) health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0) health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0) # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0) health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0) health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0) # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) #画图 health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \ health_status["dc001s3_score"] + health_status["dc002_score"]+ \ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\ health_status["draw_score"] #心理得分 health_status["dc009_score"] = health_status["dc009"]-1 health_status["dc010_score"] = health_status["dc010"]-1 health_status["dc011_score"] = health_status["dc011"]-1 health_status["dc012_score"] = health_status["dc012"]-1 health_status["dc013_score"] = 4 - health_status["dc013"] health_status["dc014_score"] = health_status["dc014"]-1 health_status["dc015_score"] = health_status["dc015"]-1 health_status["dc016_score"] = 4 - health_status["dc016"] health_status["dc017_score"] = health_status["dc017"]-1 health_status["dc018_score"] = health_status["dc018"]-1 data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"] #睡眠状态 # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天) # (2)Some or a little of the time (1-2 days) 不太多(1-2天) # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) data_2011["sleep_state"] = health_status['dc015'] #ADL health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \ health_status["db014_score"] + health_status["db015_score"] # 是否有管道煤气或天然气? houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 是否带供暖设施(不包括土暖气和可制暖的空调)? houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 供暖所用的主要能源是什么? # (1)Solar 太阳能 # (2)Coal 煤炭、蜂窝煤 # (3)Natural gas 管道天然气或煤气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)Crop residue/Wood buring 秸秆、柴火 # (7)Other 其他 houseing["Heating_Energy"] = houseing["i021"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan) # 做饭用的主要燃料是什么? # (1)Coal 煤炭、蜂窝煤 # (2)Natural gas 管道天然气或煤气 # (3)Marsh gas 沼气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)crop residue/Wood burning 秸秆、柴火 # (7)other 其他 houseing["Cooking_Fuel"] = houseing["i022"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan) houseing_select = houseing[['householdID', 'communityID','Gas_Connection', 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']] data_2011 = pd.merge(data_2011, houseing_select, on = ['householdID', 'communityID'], how="left") data_2011["wave"] = year change_columns(data_2011) # 2011年的ID和其他年份有一点区别,倒数第三位加0 data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x) print("2011 complete") # 2013年 year = "2013" demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta") psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk') biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta") health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta") health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta") exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta") weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta") houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing_Characteristics.dta") #性别#年龄#婚姻状况 # 1 married or partnered # 0 other marital status (separated, divorced, unmarried, or widowed) demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1) #教育 # 0 below high school # 1 high school # 2 college or above # 纠正2011年统计错误的教育 demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1) demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan) education_correct = demo[['ID',"education_correct"]] # 按 'ID' 列合并两个表 data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left') # 使用 fillna() 来更新字段 data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education']) # 删除多余的列 data_2011 = data_2011.drop(columns=['education_correct']) #更新2013的教育 demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1) demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan) #合并2011年的教育 eductaion_2011 = data_2011[['ID',"education"]] # 按 'ID' 列合并两个表 demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011")) # 使用 fillna() 来更新字段 demo['education'] = demo['education_2013'].fillna(demo['education_2011']) # 纠正2011年统计错误的出生年 demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1) demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1) birth_year_2013 = demo[['ID',"birth_year", "birth_month"]] # 按 'ID' 列合并两个表 data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013")) # 使用 fillna() 来更新字段 data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011']) data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011']) # 删除多余的列 data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011']) #合并2011年的出生年 birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]] # 按 'ID' 列合并两个表 demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011")) # 使用 fillna() 来更新字段 demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011']) demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011']) #获取随访时间 demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left") data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]] #居住地 # 0 农村 # 1 城市 data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left") #身高#体重#收缩压#舒张压 biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) #腰围 biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) #血压测量后两次的平均 biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2 biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2 #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗 # 1 yes # 0 no biomarkers["Sit_Stand_5x"] = biomarkers["qh002"].apply(lambda x : 1 if x == 1 else 0 if x == 5 else np.nan) # 步行速度时间 biomarkers["Walking_Speed_Time"] = (biomarkers["qg002"] + biomarkers["qg003"]) /2 biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic', "Sit_Stand_5x", "Walking_Speed_Time"]] data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left") #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan # 慢性病: # (1) Hypertension 高血压病 # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降) # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高) # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌) # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌) # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病 # (除脂肪肝、肿瘤或癌外) # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病) # (8) Stroke 中风 # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌) # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌) # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题 # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症) # (13) Arthritis or rheumatism 关节炎或风湿病 # (14) Asthma 哮喘 # 体力活动 # 2 vigorous (vigorous activity more than once a week) # 1 moderate (moderate activity more than once a week) # 0 inactive (the rest) health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 1 if x["da051_2_"]==1 else 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) else np.nan ,axis=1) # 抽烟 # 1 抽过烟 # 0 没有抽过烟 health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1) # 喝酒 # 1 喝过酒 # 0 没有喝过酒 health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 0 if x["da069"] == 1 else 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1) # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗? # 1 是 # 0 否 health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 过去两年有没有摔倒? # 1 是 # 0 否 health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # # 您什么时候开始来月经的?(year/age) # health_status['Menarche_Year']=health_status["da026_1"] # health_status['Menarche_Age']=health_status["da026_2"] # # 您什么时候开始绝经的? # health_status['Menopause_Year']=health_status["da028_1"] # health_status['Menopause_Age']=health_status["da028_2"] # # 第一次诊断出您有前列腺疾病是在什么时候? # health_status['Prostate_Issue_Year']=health_status["da030_1"] # health_status['Prostate_Issue_Age']=health_status["da030_2"] # 是否戴眼镜(包括矫正视力镜片)? # 1 是 # 0 否 # 2 失明 # 3 偶尔 health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan) # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间) health_status['Average_Sleep_Hours']=health_status["da049"] # 过去一个月内,您通常午睡多长时间?分钟 health_status['Average_Nap_Minutes']=health_status["da050"] # 您通常每周有没有至少持续做激烈活动十分钟? health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续做中等强度的体力活动十分钟? health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续走路十分钟? health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有多少天做[激烈活动]至少十分钟? health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1) # 您通常每周有多少天做[中等强度的体力活动]至少十分钟? health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1) # 您通常每周有多少天做[走路]至少十分钟? health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1) # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时 # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时 # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时 # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟 # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟 # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟 # health_status['Walking_30Min_PerDay']=health_status["da054_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时 # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时 # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时 # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"] # 活动的原因 # 1 工作需要 # 2 娱乐 # 3 体育锻炼 # 4 其他 # health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"] # health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"] # health_status["Reason_For_Walking"]= health_status["da051_1_3_"] # 过去一个月是否进行了下列社交活动? # (1) 串门、跟朋友交往 # (2) 打麻将、下棋、打牌、去社区活动室 # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助 # (4) 去公园或者其他场所跳舞、健身、练气功等 # (5) 参加社团组织活动 # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人 # (7) 上学或者参加培训课程 # (8)其他 # (9) 以上均没有 health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1) health_status["da056s2"] = health_status.apply(lambda x: 1 if x["da056s2"]==2 else 0, axis=1) health_status["da056s3"] = health_status.apply(lambda x: 1 if x["da056s3"]==3 else 0, axis=1) health_status["da056s4"] = health_status.apply(lambda x: 1 if x["da056s4"]==4 else 0, axis=1) health_status["da056s5"] = health_status.apply(lambda x: 1 if x["da056s5"]==5 else 0, axis=1) health_status["da056s6"] = health_status.apply(lambda x: 1 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1) health_status["da056s7"] = health_status.apply(lambda x: 1 if x["da056s8"]==8 else 0, axis=1) health_status["da056s8"] = health_status.apply(lambda x: 1 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1) health_status["da056s9"] = health_status.apply(lambda x: 1 if x["da056s12"]==12 else 0, axis=1) # 过去一个月的活动频率 # (1) Almost daily 差不多每天 # (2) Almost every week 差不多每周 # (3) Not regularly 不经常 # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1) # health_status["da057_7_"] = health_status["da057_8_"] # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1 # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2 # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3 # else np.nan, axis=1) # 过去一个月,您是否上网? health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0) # # 使用以下哪些工具上网? # health_status[["Internet_Tools_Desktop_computer"]] = np.nan # health_status[["Internet_Tools_Laptop_computer"]] = np.nan # health_status[["Internet_Tools_Tablet_computer"]] = np.nan # health_status[["Internet_Tools_Cellphone"]] = np.nan # health_status[["Internet_Tools_Other"]] = np.nan # # 上网一般做什么? # health_status[["Internet_Purpose_Chat"]] = np.nan # health_status[["Internet_Purpose_news"]] = np.nan # health_status[["Internet_Purpose_videos"]] = np.nan # health_status[["Internet_Purpose_games"]] = np.nan # health_status[["Internet_Purpose_Financial"]] = np.nan # health_status[["Internet_Purpose_Others"]] = np.nan # # 是否会用手机支付 # health_status[["Mobile_Payment"]] = np.nan # # 是否使用微信? # health_status[["Wechat_Usage"]] = np.nan # # 发不发微信朋友圈? # health_status[["Post_Moments"]] = np.nan # # 现在还在吸烟还是戒烟了? # # 1 仍然抽烟 Skip DA062 请跳过DA062 # # 2 戒烟 # health_status['Current_Smoking_Status']=health_status["da061"] # # 吸烟时,一般抽什么烟? # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟) # # (2) Smoking self-rolled cigarettes 自己卷烟抽 # # (3) Filtered cigarette带滤咀香烟 # # (4) Unfiltered cigarette不带滤咀香烟 # # (5) Cigar雪茄 # # (6) Water cigarettes 水烟 # health_status[['Smoking_Type_pipe']]=np.nan # health_status[['Smoking_Type_rolled']]=np.nan # health_status[['Smoking_Type_Filtered']]=np.nan # health_status[['Smoking_Type_Unfiltered']]=np.nan # health_status[['Smoking_Type_Cigar']]=np.nan # health_status[['Smoking_Type_Water']]=np.nan # # 现在/戒烟前平均一天抽多少支香烟? # health_status['Daily_Cigarette_Count']=health_status["da063"] # 在过去的一年, 喝酒吗 # (1) Drink more than once a month. 喝酒,每月超过一次 # (2) Drink but less than once a month 喝酒,但每月少于一次 # (3) None of these 什么都不喝 health_status['Drink_PastYear']=health_status["da067"] # 过去一年内 平均一个月喝几次酒 # (1)Once a month 每月一次 # (2)2-3 times a month 每月2-3次 # (3)Once a week 每周一次 # (4)2-3 times a week 每周2-3次 # (5)4-6 times a week 每周4-6次 # (6)Once a day 每天一次 # (7)Twice a day 一天两次 # (8)More than twice a day 一天超过两次 # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1) # 自上次访问以来的两年内,您是否发作过心脏病? # 1 是 # 0 否 health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else 0 if x["da007_w2_5"] == 2 else np.nan, axis=1) # 自上次访问以来,是否有医生诊断您中风复发? # 1 是 # 0 否 health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else 0 if x["da019_w2_1"] == 2 else np.nan, axis=1) # 合并2011年的慢性病 columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_' ,'da007_12_','da007_13_','da007_14_'] columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma'] for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new): health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1) diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']] # 按 'ID' 列合并两个表 health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011")) # 使用 fillna() 来更新字段 for col in columns_to_diseases_new: health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011']) health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min" , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days" , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9" , "Internet_Usage_LastMonth", "Drink_PastYear"]] data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left") #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分 health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0) health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0) health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0) #词语记忆 health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0) health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0) health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0) # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0) health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0) health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0) # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) #画图 health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \ health_status["dc001s3_score"] + health_status["dc002_score"]+ \ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\ health_status["draw_score"] #心理得分 health_status["dc009_score"] = health_status["dc009"]-1 health_status["dc010_score"] = health_status["dc010"]-1 health_status["dc011_score"] = health_status["dc011"]-1 health_status["dc012_score"] = health_status["dc012"]-1 health_status["dc013_score"] = 4 - health_status["dc013"] health_status["dc014_score"] = health_status["dc014"]-1 health_status["dc015_score"] = health_status["dc015"]-1 health_status["dc016_score"] = 4 - health_status["dc016"] health_status["dc017_score"] = health_status["dc017"]-1 health_status["dc018_score"] = health_status["dc018"]-1 data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"] #睡眠状态 # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天) # (2)Some or a little of the time (1-2 days) 不太多(1-2天) # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) data_2013["sleep_state"] = health_status['dc015'] #ADL health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \ health_status["db014_score"] + health_status["db015_score"] # 是否有管道煤气或天然气? houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 是否带供暖设施(不包括土暖气和可制暖的空调)? houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 供暖所用的主要能源是什么? # (1)Solar 太阳能 # (2)Coal 煤炭、蜂窝煤 # (3)Natural gas 管道天然气或煤气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)Crop residue/Wood buring 秸秆、柴火 # (7)Other 其他 houseing["Heating_Energy"] = houseing["i021"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan) # 做饭用的主要燃料是什么? # (1)Coal 煤炭、蜂窝煤 # (2)Natural gas 管道天然气或煤气 # (3)Marsh gas 沼气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)crop residue/Wood burning 秸秆、柴火 # (7)other 其他 houseing["Cooking_Fuel"] = houseing["i022"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan) houseing_select = houseing[['ID','householdID', 'communityID','Gas_Connection', 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']] data_2013 = pd.merge(data_2013, houseing_select, on = ["ID", 'householdID', 'communityID'], how="left") data_2013["wave"] = year change_columns(data_2013) data_2013 = pd.concat([data_2011, data_2013], axis=0) print("2013 complete") # 2015年 year = "2015" demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta") psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk') blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta") biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta") health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta") health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta") weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta") houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing_Characteristics.dta") #性别#年龄#婚姻状况 # 1 married or partnered # 0 other marital status (separated, divorced, unmarried, or widowed) demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1) #教育 # 0 below high school # 1 high school # 2 college or above #更新2015的教育 demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1) demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan) #合并2013年的教育 eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]] # 按 'ID' 列合并两个表 demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013")) # 使用 fillna() 来更新字段 demo['education'] = demo['education_2015'].fillna(demo['education_2013']) # 2015年的出生年 demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1) demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1) #获取随访时间 demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left") data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']] #居住地 # 0 农村 # 1 城市 data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left") #身高#体重#收缩压#舒张压 biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x) biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x) #腰围 biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x) #血压测量后两次的平均 biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x) biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x) biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x) biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x) biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2 biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2 #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗 # 1 yes # 0 no biomarkers["Sit_Stand_5x"] = biomarkers["qh002"].apply(lambda x : 1 if x == 1 else 0 if x == 5 else np.nan) # 步行速度时间 biomarkers["Walking_Speed_Time"] = (biomarkers["qg002"] + biomarkers["qg003"]) /2 #身高#体重#收缩压#舒张压 biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic', "Sit_Stand_5x", "Walking_Speed_Time"]] data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left") #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']] data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left") # 慢性病: # (1) Hypertension 高血压病 # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降) # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高) # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌) # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌) # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病 # (除脂肪肝、肿瘤或癌外) # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病) # (8) Stroke 中风 # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌) # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌) # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题 # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症) # (13) Arthritis or rheumatism 关节炎或风湿病 # (14) Asthma 哮喘 # 体力活动 # 2 vigorous (vigorous activity more than once a week) # 1 moderate (moderate activity more than once a week) # 0 inactive (the rest) health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 1 if x["da051_2_"]==1 else 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) else np.nan ,axis=1) # 抽烟 # 1 抽过烟 # 0 没有抽过烟 health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1) # 喝酒 # 1 喝过酒 # 0 没有喝过酒 health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 0 if x["da069"] == 1 else 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1) # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗? # 1 是 # 0 否 health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 过去两年有没有摔倒? # 1 是 # 0 否 health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # # 您什么时候开始来月经的?(year/age) # health_status['Menarche_Year']=health_status["da026_1"] # health_status['Menarche_Age']=health_status["da026_2"] # # 您什么时候开始绝经的? # health_status['Menopause_Year']=health_status["da028_1"] # health_status['Menopause_Age']=health_status["da028_2"] # # 第一次诊断出您有前列腺疾病是在什么时候? # health_status['Prostate_Issue_Year']=health_status["da030_1"] # health_status['Prostate_Issue_Age']=health_status["da030_2"] # 是否戴眼镜(包括矫正视力镜片)? # 1 是 # 0 否 # 2 失明 # 3 偶尔 health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan) # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间) health_status['Average_Sleep_Hours']=health_status["da049"] # 过去一个月内,您通常午睡多长时间?分钟 health_status['Average_Nap_Minutes']=health_status["da050"] # 您通常每周有没有至少持续做激烈活动十分钟? health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续做中等强度的体力活动十分钟? health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续走路十分钟? health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有多少天做[激烈活动]至少十分钟? health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1) # 您通常每周有多少天做[中等强度的体力活动]至少十分钟? health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1) # 您通常每周有多少天做[走路]至少十分钟? health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1) # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时 # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时 # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时 # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟 # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟 # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟 # health_status['Walking_30Min_PerDay']=health_status["da054_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时 # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时 # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时 # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"] # 活动的原因 # 1 工作需要 # 2 娱乐 # 3 体育锻炼 # 4 其他 # health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"] # health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"] # health_status["Reason_For_Walking"]= health_status["da051_1_3_"] # 过去一个月是否进行了下列社交活动? # (1) 串门、跟朋友交往 # (2) 打麻将、下棋、打牌、去社区活动室 # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助 # (4) 去公园或者其他场所跳舞、健身、练气功等 # (5) 参加社团组织活动 # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人 # (7) 上学或者参加培训课程 # (8)其他 # (9) 以上均没有 health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1) health_status["da056s2"] = health_status.apply(lambda x: 1 if x["da056s2"]==2 else 0, axis=1) health_status["da056s3"] = health_status.apply(lambda x: 1 if x["da056s3"]==3 else 0, axis=1) health_status["da056s4"] = health_status.apply(lambda x: 1 if x["da056s4"]==4 else 0, axis=1) health_status["da056s5"] = health_status.apply(lambda x: 1 if x["da056s5"]==5 else 0, axis=1) health_status["da056s6"] = health_status.apply(lambda x: 1 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1) health_status["da056s7"] = health_status.apply(lambda x: 1 if x["da056s8"]==8 else 0, axis=1) health_status["da056s8"] = health_status.apply(lambda x: 1 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1) health_status["da056s9"] = health_status.apply(lambda x: 1 if x["da056s12"]==12 else 0, axis=1) # 过去一个月的活动频率 # (1) Almost daily 差不多每天 # (2) Almost every week 差不多每周 # (3) Not regularly 不经常 # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1) # health_status["da057_7_"] = health_status["da057_8_"] # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1 # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2 # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3 # else np.nan, axis=1) # 过去一个月,您是否上网? health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0) # # 使用以下哪些工具上网? # # 1. Desktop computer 台式电脑 # # 2. Laptop computer 笔记本电脑 # # 3. Tablet computer 平板电脑(如 IPAD) # # 4. Cellphone 手机 # # 5. Other devices 其他设备 # health_status["Internet_Tools_Desktop_computer"] = health_status["da056_w3s1"] # health_status["Internet_Tools_Laptop_computer"] = health_status["da056_w3s2"] # health_status["Internet_Tools_Tablet_computer"] = health_status["da056_w3s3"] # health_status["Internet_Tools_Cellphone"] = health_status["da056_w3s4"] # health_status["Internet_Tools_Other"] = health_status["da056_w3s5"] # # 上网一般做什么? # health_status[["Internet_Purpose_Chat"]] = np.nan # health_status[["Internet_Purpose_news"]] = np.nan # health_status[["Internet_Purpose_videos"]] = np.nan # health_status[["Internet_Purpose_games"]] = np.nan # health_status[["Internet_Purpose_Financial"]] = np.nan # health_status[["Internet_Purpose_Others"]] = np.nan # # 是否会用手机支付 # health_status[["Mobile_Payment"]] = np.nan # # 是否使用微信? # health_status[["Wechat_Usage"]] = np.nan # # 发不发微信朋友圈? # health_status[["Post_Moments"]] = np.nan # # 现在还在吸烟还是戒烟了? # # 1 仍然抽烟 # # 2 戒烟 # health_status['Current_Smoking_Status']=health_status["da061"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else np.nan) # # 更新一下2013年没有回答的用户 # health_status["da061_w3"] = health_status["da061_w3"].apply(lambda x : 1 if x ==1 else 2 if x ==2 else np.nan) # data_2013 = pd.merge(data_2013, health_status[['ID',"da061_w3"]], on='ID', how='left') # data_2013['Current_Smoking_Status'] = data_2013['Current_Smoking_Status'].fillna(data_2013['da061_w3']) # data_2013 = data_2013.drop('da061_w3', axis=1) # # 按 'ID' 列合并两个表 # # 吸烟时,一般抽什么烟? # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟) # # (2) Smoking self-rolled cigarettes 自己卷烟抽 # # (3) Filtered cigarette带滤咀香烟 # # (4) Unfiltered cigarette不带滤咀香烟 # # (5) Cigar雪茄 # # (6) Water cigarettes 水烟 # health_status['Smoking_Type_pipe']=health_status["da060s1"] # health_status['Smoking_Type_rolled']=health_status["da060s2"] # health_status['Smoking_Type_Filtered']=health_status["da060s3"] # health_status['Smoking_Type_Unfiltered']=health_status["da060s4"] # health_status['Smoking_Type_Cigar']=health_status["da060s5"] # health_status['Smoking_Type_Water']=health_status["da060s6"] # # 现在/戒烟前平均一天抽多少支香烟? # health_status['Daily_Cigarette_Count']=health_status["da063"] # 在过去的一年, 喝酒吗 # (1) Drink more than once a month. 喝酒,每月超过一次 # (2) Drink but less than once a month 喝酒,但每月少于一次 # (3) None of these 什么都不喝 health_status['Drink_PastYear']=health_status["da067"] # 过去一年内 平均一个月喝几次酒 # (1)Once a month 每月一次 # (2)2-3 times a month 每月2-3次 # (3)Once a week 每周一次 # (4)2-3 times a week 每周2-3次 # (5)4-6 times a week 每周4-6次 # (6)Once a day 每天一次 # (7)Twice a day 一天两次 # (8)More than twice a day 一天超过两次 # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1) # 自上次访问以来的两年内,您是否发作过心脏病? # 1 是 # 0 否 health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else 0 if x["da007_w2_5"] == 2 else np.nan, axis=1) # 自上次访问以来,是否有医生诊断您中风复发? # 1 是 # 0 否 health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else 0 if x["da019_w2_1"] == 2 else np.nan, axis=1) # 合并2013年的慢性病 columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_' ,'da007_12_','da007_13_','da007_14_'] columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma'] for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new): health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1) diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']] # 按 'ID' 列合并两个表 health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013")) # 使用 fillna() 来更新字段 for col in columns_to_diseases_new: health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013']) health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min" , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days" , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9" , "Internet_Usage_LastMonth", "Drink_PastYear"]] data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left") #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分 health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0) health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0) health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0) #词语记忆 health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0) health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0) health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0) # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0) health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0) health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0) health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0) health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0) health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0) health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0) health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0) health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0) health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0) # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) #画图 health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \ health_status["dc001s3_score"] + health_status["dc002_score"]+ \ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\ health_status["draw_score"] #心理得分 health_status["dc009_score"] = health_status["dc009"]-1 health_status["dc010_score"] = health_status["dc010"]-1 health_status["dc011_score"] = health_status["dc011"]-1 health_status["dc012_score"] = health_status["dc012"]-1 health_status["dc013_score"] = 4 - health_status["dc013"] health_status["dc014_score"] = health_status["dc014"]-1 health_status["dc015_score"] = health_status["dc015"]-1 health_status["dc016_score"] = 4 - health_status["dc016"] health_status["dc017_score"] = health_status["dc017"]-1 health_status["dc018_score"] = health_status["dc018"]-1 data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"] #睡眠状态 # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天) # (2)Some or a little of the time (1-2 days) 不太多(1-2天) # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) data_2015["sleep_state"] = health_status['dc015'] #ADL health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \ health_status["db014_score"] + health_status["db015_score"] # 是否有管道煤气或天然气? houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 是否带供暖设施(不包括土暖气和可制暖的空调)? houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 供暖所用的主要能源是什么? # (1)Solar 太阳能 # (2)Coal 煤炭、蜂窝煤 # (3)Natural gas 管道天然气或煤气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)Crop residue/Wood buring 秸秆、柴火 # (7)Other 其他 houseing["Heating_Energy"] = houseing["i021"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan) # 做饭用的主要燃料是什么? # (1)Coal 煤炭、蜂窝煤 # (2)Natural gas 管道天然气或煤气 # (3)Marsh gas 沼气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)crop residue/Wood burning 秸秆、柴火 # (7)other 其他 houseing["Cooking_Fuel"] = houseing["i022"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan) houseing_select = houseing[['ID','householdID', 'communityID','Gas_Connection', 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']] data_2015 = pd.merge(data_2015, houseing_select, on = ["ID", 'householdID', 'communityID'], how="left") data_2015["wave"] = year change_columns(data_2015) data_2015 = pd.concat([data_2013, data_2015], axis=0) print("2015 complete") # 2018年 year = "2018" demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta") psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk') health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta") health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta") cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta") weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta") houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing.dta") #性别#年龄#婚姻状况 # 1 married or partnered # 0 other marital status (separated, divorced, unmarried, or widowed) demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1) #教育 # 0 below high school # 1 high school # 2 college or above #更新2015的教育 demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1) demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan) # 出生年 demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1) demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1) #获取随访时间 demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left") data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']] #居住地 # 0 农村 # 1 城市 data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left") #身高#体重#腰围#收缩压#舒张压 data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗 data_2018[["Sit_Stand_5x", "Walking_Speed_Time"]] = np.nan #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan # 慢性病: # (1) Hypertension 高血压病 # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降) # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高) # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌) # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌) # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病 # (除脂肪肝、肿瘤或癌外) # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病) # (8) Stroke 中风 # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌) # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌) # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题 # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症) # (13) Arthritis or rheumatism 关节炎或风湿病 # (14) Asthma 哮喘 # 体力活动 # 2 vigorous (vigorous activity more than once a week) # 1 moderate (moderate activity more than once a week) # 0 inactive (the rest) health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else 1 if x["da051_2_"]==1 else 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2) else np.nan ,axis=1) # 抽烟 # 1 抽过烟 # 0 没有抽过烟 health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1) # 喝酒 # 1 喝过酒 # 0 没有喝过酒 health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else 0 if x["da069"] == 1 else 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1) # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗? # 1 是 # 0 否 health_status['Accident_Or_Injury']=health_status.apply(lambda x : 1 if (not pd.isna(x["da021"]) and x["da021"]==1) or (pd.isna(x["da021"]) and not pd.isna(x["da022"]) )else 0 if (not pd.isna(x["da021"]) and x["da021"]==2) or (pd.isna(x["da021"]) and pd.isna(x["da022"]) ) else np.nan, axis=1) # 过去两年有没有摔倒? # 1 是 # 0 否 health_status['Fell_In_Last2Years']=health_status.apply(lambda x : 1 if x["da023"] ==1 or x["da023_w4"]==1 else 0 if x["da023"] ==2 or x["da023_w4"]==2 else np.nan, axis=1) # # 您什么时候开始来月经的?(year/age) # health_status['Menarche_Year']=health_status["da026_1"] # health_status['Menarche_Age']=health_status["da026_2"] # # 您什么时候开始绝经的? # health_status['Menopause_Year']=health_status["da028_1"] # health_status['Menopause_Age']=health_status["da028_2"] # # 第一次诊断出您有前列腺疾病是在什么时候? # health_status['Prostate_Issue_Year']=health_status["da030_1"] # health_status['Prostate_Issue_Age']=health_status["da030_2"] # 是否戴眼镜(包括矫正视力镜片)? # 1 是 # 0 否 # 2 失明 # 3 偶尔 health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan) # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间) health_status['Average_Sleep_Hours']=health_status["da049"] # 过去一个月内,您通常午睡多长时间?分钟 health_status['Average_Nap_Minutes']=health_status["da050"] # 您通常每周有没有至少持续做激烈活动十分钟? health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续做中等强度的体力活动十分钟? health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续走路十分钟? health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有多少天做[激烈活动]至少十分钟? health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1) # 您通常每周有多少天做[中等强度的体力活动]至少十分钟? health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1) # 您通常每周有多少天做[走路]至少十分钟? health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1) # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时 # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时 # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时 # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟 # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟 # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟 # health_status['Walking_30Min_PerDay']=health_status["da054_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时 # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时 # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时 # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"] # 活动的原因 # 1 工作需要 # 2 娱乐 # 3 体育锻炼 # 4 其他 # health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"] # health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"] # health_status["Reason_For_Walking"]= health_status["da051_1_3_"] # 过去一个月是否进行了下列社交活动? # (1) 串门、跟朋友交往 # (2) 打麻将、下棋、打牌、去社区活动室 # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助 # (4) 去公园或者其他场所跳舞、健身、练气功等 # (5) 参加社团组织活动 # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人 # (7) 上学或者参加培训课程 # (8)其他 # (9) 以上均没有 health_status["da056_s1"] = health_status.apply(lambda x: 1 if x["da056_s1"]==1 else 0, axis=1) health_status["da056_s2"] = health_status.apply(lambda x: 1 if x["da056_s2"]==2 else 0, axis=1) health_status["da056_s3"] = health_status.apply(lambda x: 1 if x["da056_s3"]==3 else 0, axis=1) health_status["da056_s4"] = health_status.apply(lambda x: 1 if x["da056_s4"]==4 else 0, axis=1) health_status["da056_s5"] = health_status.apply(lambda x: 1 if x["da056_s5"]==5 else 0, axis=1) health_status["da056_s6"] = health_status.apply(lambda x: 1 if x["da056_s6"]==6 or x["da056_s7"]==7 else 0, axis=1) health_status["da056_s7"] = health_status.apply(lambda x: 1 if x["da056_s8"]==8 else 0, axis=1) health_status["da056_s8"] = health_status.apply(lambda x: 1 if x["da056_s9"]==9 or x["da056_s10"]==10 or x["da056_s11"]==11 else 0, axis=1) health_status["da056_s9"] = health_status.apply(lambda x: 1 if x["da056_s12"]==12 else 0, axis=1) # 过去一个月的活动频率 # (1) Almost daily 差不多每天 # (2) Almost every week 差不多每周 # (3) Not regularly 不经常 # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1) # health_status["da057_7_"] = health_status["da057_8_"] # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1 # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2 # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3 # else np.nan, axis=1) # 过去一个月,您是否上网? health_status["Internet_Usage_LastMonth"] = health_status["da056_s10"].apply(lambda x : 1 if x==10 else 0) # 使用以下哪些工具上网? # 1. Desktop computer 台式电脑 # 2. Laptop computer 笔记本电脑 # 3. Tablet computer 平板电脑(如 IPAD) # 4. Cellphone 手机 # # 5. Other devices 其他设备 # health_status["Internet_Tools_Desktop_computer"] = health_status["da056_w3_s1"] # health_status["Internet_Tools_Laptop_computer"] = health_status["da056_w3_s2"] # health_status["Internet_Tools_Tablet_computer"] = health_status["da056_w3_s3"] # health_status["Internet_Tools_Cellphone"] = health_status["da056_w3_s4"] # health_status["Internet_Tools_Other"] = health_status["da056_w3_s5"] # # 上网一般做什么? # health_status["Internet_Purpose_Chat"] = health_status["da056_w4_1_s1"] # health_status["Internet_Purpose_news"] = health_status["da056_w4_1_s2"] # health_status["Internet_Purpose_videos"] = health_status["da056_w4_1_s3"] # health_status["Internet_Purpose_games"] = health_status["da056_w4_1_s4"] # health_status["Internet_Purpose_Financial"] = health_status["da056_w4_1_s5"] # health_status["Internet_Purpose_Others"] = health_status["da056_w4_1_s6"] # # 是否会用手机支付 # # 1 是 # # 0 否 # health_status["Mobile_Payment"] = health_status["da056_w4_2"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) # # 是否使用微信? # health_status["Wechat_Usage"] = health_status["da056_w4_3"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) # # 发不发微信朋友圈? # health_status["Post_Moments"] = health_status["da056_w4_4"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) # # 现在还在吸烟还是戒烟了? # # 1 仍然抽烟 # # 2 戒烟 # health_status['Current_Smoking_Status']=health_status.apply(lambda x : 1 if x["da061"] == 1 or x["da061_w4"] == 1 else 2 if x["da061"] == 2 or x["da061_w4"] == 2 else np.nan, axis=1) # # 吸烟时,一般抽什么烟? # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟) # # (2) Smoking self-rolled cigarettes 自己卷烟抽 # # (3) Filtered cigarette带滤咀香烟 # # (4) Unfiltered cigarette不带滤咀香烟 # # (5) Cigar雪茄 # # (6) Water cigarettes 水烟 # health_status.loc[health_status['da060'] == 1, 'Smoking_Type_pipe'] = 1 # health_status.loc[health_status['da060'] == 2, 'Smoking_Type_rolled'] = 2 # health_status.loc[health_status['da060'] == 3, 'Smoking_Type_Filtered'] = 3 # health_status.loc[health_status['da060'] == 4, 'Smoking_Type_Unfiltered'] = 4 # health_status.loc[health_status['da060'] == 5, 'Smoking_Type_Cigar'] = 5 # health_status.loc[health_status['da060'] == 6, 'Smoking_Type_Water'] = 6 # # 现在/戒烟前平均一天抽多少支香烟? # health_status['Daily_Cigarette_Count']=health_status["da063"] # 在过去的一年, 喝酒吗 # (1) Drink more than once a month. 喝酒,每月超过一次 # (2) Drink but less than once a month 喝酒,但每月少于一次 # (3) None of these 什么都不喝 health_status['Drink_PastYear']=health_status["da067"] # 过去一年内 平均一个月喝几次酒 # (1)Once a month 每月一次 # (2)2-3 times a month 每月2-3次 # (3)Once a week 每周一次 # (4)2-3 times a week 每周2-3次 # (5)4-6 times a week 每周4-6次 # (6)Once a day 每天一次 # (7)Twice a day 一天两次 # (8)More than twice a day 一天超过两次 # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1) # 自上次访问以来的两年内,您是否发作过心脏病? # 1 是 # 0 否 health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else 0 if x["da007_w2_5"] == 2 else np.nan, axis=1) # 自上次访问以来,是否有医生诊断您中风复发? # 1 是 # 0 否 health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else 0 if x["da019_w2_1"] == 2 else np.nan, axis=1) columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_' ,'da007_12_','da007_13_','da007_14_'] columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma'] for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new): health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1) diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']] # 按 'ID' 列合并两个表 health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015")) # 使用 fillna() 来更新字段 for col in columns_to_diseases_new: health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015']) health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min" , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days" , "da056_s1", "da056_s2", "da056_s3", "da056_s4", "da056_s5", "da056_s6", "da056_s7", "da056_s8", "da056_s9" , "Internet_Usage_LastMonth", "Drink_PastYear"]] data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left") #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分 cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan) cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan) cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan) cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan) # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan) cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1) cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1) cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1) #词语记忆 cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1) cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1) cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1) cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1) cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1) cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1) cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1) cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1) cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1) cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1) # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1) cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1) cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1) cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1) cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1) cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1) cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1) cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1) cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1) cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1) # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) #画图 cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan) data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \ cognition["dc001s3_score"] + cognition["dc002_score"]+ \ cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \ cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \ cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \ cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \ cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \ cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \ cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \ cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \ cognition["dc027s9_score"]+cognition["dc027s10_score"]+\ cognition["draw_score"] #心理得分 cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \ cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \ cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"] #睡眠状态 # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天) # (2)Some or a little of the time (1-2 days) 不太多(1-2天) # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x) #ADL health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \ health_status["db014_score"] + health_status["db015_score"] # 是否有管道煤气或天然气? houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 是否带供暖设施(不包括土暖气和可制暖的空调)? houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 供暖所用的主要能源是什么? # (1)Solar 太阳能 # (2)Coal 煤炭、蜂窝煤 # (3)Natural gas 管道天然气或煤气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)Crop residue/Wood buring 秸秆、柴火 # (7)Other 其他 houseing["Heating_Energy"] = houseing["i021_w4"].apply(lambda x : 0 if x==8 or x==7 else x ) # 做饭用的主要燃料是什么? # (1)Coal 煤炭、蜂窝煤 # (2)Natural gas 管道天然气或煤气 # (3)Marsh gas 沼气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)crop residue/Wood burning 秸秆、柴火 # (7)other 其他 houseing["Cooking_Fuel"] = houseing["i022_w4"].apply(lambda x : np.nan if x==8 else 0 if x==7 else x ) houseing_select = houseing[['householdID', 'communityID','Gas_Connection', 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']] data_2018 = pd.merge(data_2018, houseing_select, on = ['householdID', 'communityID'], how="left") data_2018["wave"] = year change_columns(data_2018) data_2018 = pd.concat([data_2015, data_2018], axis=0) print("2018 complete") # 2020年 year = "2020" demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta") psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk') health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta") weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta") houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Household_Income.dta") #性别#年龄#婚姻状况 # 1 married or partnered # 0 other marital status (separated, divorced, unmarried, or widowed) demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1) #教育 # 0 below high school # 1 high school # 2 college or above demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1) demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan) #合并2018年的教育 eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]] # 按 'ID' 列合并两个表 demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018")) # 使用 fillna() 来更新字段 demo['education'] = demo['education_2020'].fillna(demo['education_2018']) # 出生年 demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1) demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1) #合并2018年的出生年 birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]] # 按 'ID' 列合并两个表 demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018")) # 使用 fillna() 来更新字段 demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018']) demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018']) #获取随访时间 demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left") demo["ba003"] = 1 data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']] #居住地 # 0 农村 # 1 城市 data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left") #身高#体重#收缩压#舒张压 data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗 data_2020[["Sit_Stand_5x", "Walking_Speed_Time"]] = np.nan #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan # 慢性病: # (1) Hypertension 高血压病 # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降) # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高) # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌) # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌) # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病 # (除脂肪肝、肿瘤或癌外) # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病) # (8) Stroke 中风 # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌) # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌) # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题 # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症) # (13) Arthritis or rheumatism 关节炎或风湿病 # (14) Asthma 哮喘 # 2020年把帕金森和记忆病症分开,需要和以前对齐 # 体力活动 # 2 vigorous (vigorous activity more than once a week) # 1 moderate (moderate activity more than once a week) # 0 inactive (the rest) health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else 1 if x["da032_2_"]==1 else 0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2) else np.nan ,axis=1) # 抽烟 # 1 抽过烟 # 0 没有抽过烟 health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1) # 喝酒 # 1 喝过酒 # 0 没有喝过酒 health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else 0 if x["da051"] == 3 else np.nan, axis=1) # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗? # 1 是 # 0 否 health_status['Accident_Or_Injury']=health_status.apply(lambda x : 1 if x["da019"] ==1 or x["da020"]==1 else 0 if x["da019"] ==2 or x["da020"]==2 else np.nan, axis=1) # 过去两年有没有摔倒? # 1 是 # 0 否 health_status['Fell_In_Last2Years']=health_status.apply(lambda x : 1 if x["da022"] ==1 or x["da023"]==1 else 0 if x["da022"] ==2 or x["da023"]==2 else np.nan, axis=1) # # 您什么时候开始来月经的?(year/age) # health_status[['Menarche_Year']]=np.nan # health_status[['Menarche_Age']]=np.nan # # 您什么时候开始绝经的? # health_status[['Menopause_Year']]=np.nan # health_status[['Menopause_Age']]=np.nan # # 第一次诊断出您有前列腺疾病是在什么时候? # health_status[['Prostate_Issue_Year']]=np.nan # health_status[['Prostate_Issue_Age']]=np.nan # 是否戴眼镜(包括矫正视力镜片)? # 1 是 # 0 否 # 2 失明 # 3 偶尔 health_status[['Wear_Glasses']]=np.nan # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间) health_status['Average_Sleep_Hours']=health_status["da030"] # 过去一个月内,您通常午睡多长时间?分钟 health_status['Average_Nap_Minutes']=health_status["da031"] # 您通常每周有没有至少持续做激烈活动十分钟? health_status['Vigorous_Activity_10Min']=health_status["da032_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续做中等强度的体力活动十分钟? health_status['Moderate_Effort_10Min']=health_status["da032_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有没有至少持续走路十分钟? health_status['Walking_10Min']=health_status["da032_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan) # 您通常每周有多少天做[激烈活动]至少十分钟? health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da033_1_"]) else x["da033_1_"], axis=1) # 您通常每周有多少天做[中等强度的体力活动]至少十分钟? health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da033_2_"]) else x["da033_2_"], axis=1) # 您通常每周有多少天做[走路]至少十分钟? health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da033_3_"]) else x["da033_3_"], axis=1) # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时 # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da034_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时 # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da034_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时 # health_status['Walking_2Hours_PerDay']=health_status["da034_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟 # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da035_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟 # health_status['Moderate_Effort_30Min_PerDay']=health_status["da035_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟 # health_status['Walking_30Min_PerDay']=health_status["da035_3_"] # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时 # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da036_1_"] # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时 # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da036_2_"] # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时 # health_status['Walking_4Hours_PerDay']=health_status["da036_3_"] # 活动的原因 # 1 工作需要 # 2 娱乐 # 3 体育锻炼 # 4 其他 # health_status["Reason_For_Vigorous_Activity"]= health_status["da037_1_"] # health_status["Reason_For_Moderate_Effort"]= health_status["da037_2_"] # health_status["Reason_For_Walking"]= health_status["da037_3_"] # 过去一个月是否进行了下列社交活动? # (1) 串门、跟朋友交往 # (2) 打麻将、下棋、打牌、去社区活动室 # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助 # (4) 去公园或者其他场所跳舞、健身、练气功等 # (5) 参加社团组织活动 # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人 # (7) 上学或者参加培训课程 # (8)其他 # (9) 以上均没有 health_status["da038_s1"] = health_status.apply(lambda x: 1 if x["da038_s1"]==1 else 0, axis=1) health_status["da038_s2"] = health_status.apply(lambda x: 1 if x["da038_s2"]==2 else 0, axis=1) health_status["da038_s3"] = health_status.apply(lambda x: 1 if x["da038_s3"]==3 else 0, axis=1) health_status["da038_s4"] = health_status.apply(lambda x: 1 if x["da038_s4"]==4 else 0, axis=1) health_status["da038_s5"] = health_status.apply(lambda x: 1 if x["da038_s5"]==5 else 0, axis=1) health_status["da038_s6"] = health_status.apply(lambda x: 1 if x["da038_s6"]==6 else 0, axis=1) health_status["da038_s7"] = health_status.apply(lambda x: 1 if x["da038_s7"]==7 else 0, axis=1) health_status["da038_s8"] = health_status.apply(lambda x: 1 if x["da038_s8"]==8 else 0, axis=1) health_status["da038_s9"] = health_status.apply(lambda x: 1 if x["da038_s9"]==9 else 0, axis=1) # 过去一个月的活动频率 # (1) Almost daily 差不多每天 # (2) Almost every week 差不多每周 # (3) Not regularly 不经常 # 过去一个月,您是否上网? # 1 是 # 0 否 health_status["Internet_Usage_LastMonth"] = health_status["da040"].apply(lambda x : 1 if x ==1 else 0) # 使用以下哪些工具上网? # 1. Desktop computer 台式电脑 # 2. Laptop computer 笔记本电脑 # 3. Tablet computer 平板电脑(如 IPAD) # 4. Cellphone 手机 # # 5. Other devices 其他设备 # health_status["Internet_Tools_Desktop_computer"] = health_status["da041_s1"] # health_status["Internet_Tools_Laptop_computer"] = health_status["da041_s2"] # health_status["Internet_Tools_Tablet_computer"] = health_status["da041_s3"] # health_status["Internet_Tools_Cellphone"] = health_status["da041_s4"] # health_status["Internet_Tools_Other"] = health_status["da041_s5"] # # 上网一般做什么? # health_status["Internet_Purpose_Chat"] = health_status["da042_s1"] # health_status["Internet_Purpose_news"] = health_status["da042_s2"] # health_status["Internet_Purpose_videos"] = health_status["da042_s3"] # health_status["Internet_Purpose_games"] = health_status["da042_s4"] # health_status["Internet_Purpose_Financial"] = health_status["da042_s5"] # health_status["Internet_Purpose_Others"] = health_status["da042_s6"] # # 是否会用手机支付 # # 1 是 # # 0 否 # health_status["Mobile_Payment"] = health_status["da043"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) # # 是否使用微信? # health_status["Wechat_Usage"] = health_status["da044"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) # # 发不发微信朋友圈? # health_status["Post_Moments"] = health_status["da045"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) # # 现在还在吸烟还是戒烟了? # # 1 仍然抽烟 # # 2 戒烟 # health_status['Current_Smoking_Status']=health_status.apply(lambda x : 1 if x["da047"] == 1 else 2 if x["da047"] == 2 else np.nan, axis=1) # # 吸烟时,一般抽什么烟? # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟) # # (2) Smoking self-rolled cigarettes 自己卷烟抽 # # (3) Filtered cigarette带滤咀香烟 # # (4) Unfiltered cigarette不带滤咀香烟 # # (5) Cigar雪茄 # # (6) Water cigarettes 水烟 # health_status.loc[health_status['da048'] == 1, 'Smoking_Type_pipe'] = 1 # health_status.loc[health_status['da048'] == 2, 'Smoking_Type_rolled'] = 2 # health_status.loc[health_status['da048'] == 3, 'Smoking_Type_Filtered'] = 3 # health_status.loc[health_status['da048'] == 4, 'Smoking_Type_Unfiltered'] = 4 # health_status.loc[health_status['da048'] == 5, 'Smoking_Type_Cigar'] = 5 # health_status.loc[health_status['da048'] == 6, 'Smoking_Type_Water'] = 6 # # 现在/戒烟前平均一天抽多少支香烟? # health_status['Daily_Cigarette_Count']=health_status.apply(lambda x : x["da050_1"] if not pd.isna(x["da050_1"]) else x["da050_2"] if not pd.isna(x["da050_2"]) else np.nan, axis=1) # 在过去的一年, 喝酒吗 # (1) Drink more than once a month. 喝酒,每月超过一次 # (2) Drink but less than once a month 喝酒,但每月少于一次 # (3) None of these 什么都不喝 health_status['Drink_PastYear']=health_status["da051"] # 过去一年内 平均一个月喝几次酒 # (1)Once a month 每月一次 # (2)2-3 times a month 每月2-3次 # (3)Once a week 每周一次 # (4)2-3 times a week 每周2-3次 # (5)4-6 times a week 每周4-6次 # (6)Once a day 每天一次 # (7)Twice a day 一天两次 # (8)More than twice a day 一天超过两次 # health_status['Drink_Monthly_Frequency']=health_status["da052"] health_status['da003_12_'] = health_status.apply(process_row, axis=1) columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_' ,'da003_12_','da003_14_','da003_15_'] columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma'] for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new): health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1) diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']] # 按 'ID' 列合并两个表 health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018")) # 使用 fillna() 来更新字段 for col in columns_to_diseases_new: health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018']) health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses", "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min" , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days" , "da038_s1", "da038_s2", "da038_s3", "da038_s4", "da038_s5", "da038_s6", "da038_s7", "da038_s8", "da038_s9" , "Internet_Usage_LastMonth", "Drink_PastYear"]] data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left") # 自上次访问以来的两年内,您是否发作过心脏病? # 1 是 # 0 否 data_2020[['Heart_attack_2_years']]=np.nan # 自上次访问以来,是否有医生诊断您中风复发? # 1 是 # 0 否 data_2020[['Recurrent_Stroke']]=np.nan #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分 health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1) health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1) health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1) health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1) health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1) #词语记忆 health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1) health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1) health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1) health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1) health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1) health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1) health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1) health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1) health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1) health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1) health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1) health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1) health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1) health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1) health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1) health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1) health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1) health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1) health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1) health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1) #画图 health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan) data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \ health_status["dc001s3_score"] + health_status["dc002_score"]+ \ health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \ health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \ health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \ health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \ health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \ health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \ health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \ health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \ health_status["dc027s9_score"]+health_status["dc027s10_score"]+\ health_status["draw_score"] #心理得分 health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan) health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan) data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \ health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \ health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"] #睡眠状态 # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天) # (2)Some or a little of the time (1-2 days) 不太多(1-2天) # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天) # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天) data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x) #ADL health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan) data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \ health_status["db014_score"] + health_status["db015_score"] # 是否有管道煤气或天然气? houseing["Gas_Connection"] = houseing["i018"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 是否带供暖设施(不包括土暖气和可制暖的空调)? houseing["Heating_Facility"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan) # 供暖所用的主要能源是什么? # (1)Solar 太阳能 # (2)Coal 煤炭、蜂窝煤 # (3)Natural gas 管道天然气或煤气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)Crop residue/Wood buring 秸秆、柴火 # (7)Other 其他 houseing["Heating_Energy"] = houseing["i020"].apply(lambda x : np.nan if x==8 else 0 if x==7 else x ) # 做饭用的主要燃料是什么? # (1)Coal 煤炭、蜂窝煤 # (2)Natural gas 管道天然气或煤气 # (3)Marsh gas 沼气 # (4)Liquefied Petroleum Gas 液化石油气 # (5)Electric 电 # (6)crop residue/Wood burning 秸秆、柴火 # (7)other 其他 houseing["Cooking_Fuel"] = houseing["i021"].apply(lambda x : np.nan if x==9 else 0 if x == 8 or x == 7 else x) houseing_select = houseing[['householdID', 'communityID','Gas_Connection', 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']] data_2020 = pd.merge(data_2020, houseing_select, on = ['householdID', 'communityID'], how="left") data_2020["wave"] = year change_columns(data_2020) data_2020 = pd.concat([data_2018, data_2020], axis=0) #修改地区名称 #省份、城市名称和污染物数据格式对齐 #海东地区->海东市 data_2020['city'] = data_2020['city'].replace('海东地区', '海东市') #北京 -> 北京市 data_2020['city'] = data_2020['city'].replace('北京', '北京市') data_2020['province'] = data_2020['province'].replace('北京', '北京市') #哈尔滨 -> 哈尔滨市 data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市') #天津 -> 天津市 data_2020['city'] = data_2020['city'].replace('天津', '天津市') data_2020['province'] = data_2020['province'].replace('天津', '天津市') #广西省 -> 广西壮族自治区 data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区') #巢湖市 -> 合肥市 data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市') #襄樊市->襄阳市 data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市') data_2020.to_csv("charls_paper_2.csv", index=False) print(123)