import pandas as pd import numpy as np #统一列名 def change_columns(df): df.columns = ["id", "birth_year", "sex", "marital_status", "smoking_status", "drinking_status", "vigoro_sports", "moderate_sports", "mild_sports","heart_probl", "BMI", "HbA1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","C_reactive_protein", "education" ] if __name__ == "__main__": df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False) print(df.columns) # age # sex # .u:Unmar # .v:SP NR # 1.man # 2.woman # marital status # .d:DK # .r:Refuse # 1.married # 3.partnered # 4.separated # 5.divorced # 7.widowed # 8.never married # education # .d:DK # .m:Missing # .o:other # .r:Refuse # 1.lt high-school # 3.high-school graduate # 4.some college # 5.college and above # smoking status # .d:DK # .m:Missing # .p:proxy # .r:Refuse # 0.No # 1.Yes # drinking status # .c:no self-completion inter # .d:DK # .m:Missing # .p:proxy # .r:Refuse # 0.no # 1.yes # physical activity level # .d:DK # .m:Missing # .p:proxy # .r:Refuse # 2.> 1 per week # 3.1 per week # 4.1-3 per mon # 5.hardly ever or never # body mass index (BMI) # heart_probl # stroke # glycated haemoglobin (HbA1c) # systolic blood pressure (SBP) # high-density lipoprotein cholesterol (HDL-C) # C-reactive protein # 定义需要检查的值 values_to_check = {1, 2, 3, 4, 5, 6, 7, 8, 95} # wave 1 # 解析core文件 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_core_data_v3.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "heala","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07"]] result["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) result[['BMI',"hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_ifs_derived_variables.dta', convert_categoricals=False) df.columns = df.columns.str.lower() result_one = df[["idauniq", "edqual"]] result = pd.merge(result, result_one, on=["idauniq"], how="left") change_columns(result) print(f"wave 1 finish {result.shape}") # wave 2 # 解析core文件 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_core_data_v4.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_2 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07","hedim08"]] result_2["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) # 解析nurse df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False) df.columns = df.columns.str.lower() result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]] result_2 = pd.merge(result_2, result_two, on=["idauniq"], how="left") # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_2 = pd.merge(result_2, result_one, on=["idauniq"], how="left") change_columns(result_2) result = pd.concat([result, result_2], axis=0) print(f"wave 2 finish {result.shape}") # wave 3 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_elsa_data_v4.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_3 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_3["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) result_3[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_3 = pd.merge(result_3, result_one, on=["idauniq"], how="left") change_columns(result_3) result = pd.concat([result, result_3], axis=0) print(f"wave 3 finish {result.shape}") # wave 4 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_elsa_data_v3.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_4 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_4["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) # 解析nurse df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_nurse_data.dta', convert_categoricals=False) df.columns = df.columns.str.lower() result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]] result_4 = pd.merge(result_4, result_two, on=["idauniq"], how="left") # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_4 = pd.merge(result_4, result_one, on=["idauniq"], how="left") change_columns(result_4) result = pd.concat([result, result_4], axis=0) print(f"wave 4 finish {result.shape}") # wave 5 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_elsa_data_v4.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_5 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_5["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) result_5[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_5 = pd.merge(result_5, result_one, on=["idauniq"], how="left") change_columns(result_5) result = pd.concat([result, result_5], axis=0) print(f"wave 5 finish {result.shape}") # wave 6 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_data_v2.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_6 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_6["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) # 解析nurse df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False) df.columns = df.columns.str.lower() result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]] result_6 = pd.merge(result_6, result_two, on=["idauniq"], how="left") # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_6 = pd.merge(result_6, result_one, on=["idauniq"], how="left") change_columns(result_6) result = pd.concat([result, result_6], axis=0) print(f"wave 6 finish {result.shape}") # wave 7 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_elsa_data.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_7 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_7["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) result_7[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_7 = pd.merge(result_7, result_one, on=["idauniq"], how="left") change_columns(result_7) result = pd.concat([result, result_7], axis=0) print(f"wave 7 finish {result.shape}") # wave 8 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_data_eul_v2.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_8 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scako","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_8["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) result_8['BMI'] = np.nan # 解析nurse df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False) df.columns = df.columns.str.lower() result_two = df[["idauniq", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]] result_8 = pd.merge(result_8, result_two, on=["idauniq"], how="left") # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_ifs_dvs_eul_v1.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_8 = pd.merge(result_8, result_one, on=["idauniq"], how="left") change_columns(result_8) result = pd.concat([result, result_8], axis=0) print(f"wave 8 finish {result.shape}") # wave 9 df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False) # 将列名统一转为小写 df.columns = df.columns.str.lower() result_9 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scalcm","heacta", "heactb", "heactc"]] #判断是否有心血管疾病 heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]] result_9["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1) result_9[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan # 解析ifs_derived df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False) result_one = df[["idauniq", "edqual"]] result_9 = pd.merge(result_9, result_one, on=["idauniq"], how="left") change_columns(result_9) result = pd.concat([result, result_9], axis=0) print(f"wave 9 finish {result.shape}") print(result.head()) result.to_csv("/root/r_base/UKDA-5050-stata/result_all.csv", index=False)