123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
- import pandas as pd
- import numpy as np
- #统一列名
- def change_columns(df):
- df.columns = ["id", "birth_year", "sex", "marital_status", "smoking_status", "drinking_status", "vigoro_sports", "moderate_sports",
- "mild_sports","heart_probl", "BMI", "HbA1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","C_reactive_protein", "education"
- ]
- if __name__ == "__main__":
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
- print(df.columns)
- # age
- # sex
- # .u:Unmar
- # .v:SP NR
- # 1.man
- # 2.woman
- # marital status
- # .d:DK
- # .r:Refuse
- # 1.married
- # 3.partnered
- # 4.separated
- # 5.divorced
- # 7.widowed
- # 8.never married
- # education
- # .d:DK
- # .m:Missing
- # .o:other
- # .r:Refuse
- # 1.lt high-school
- # 3.high-school graduate
- # 4.some college
- # 5.college and above
- # smoking status
- # .d:DK
- # .m:Missing
- # .p:proxy
- # .r:Refuse
- # 0.No
- # 1.Yes
- # drinking status
- # .c:no self-completion inter
- # .d:DK
- # .m:Missing
- # .p:proxy
- # .r:Refuse
- # 0.no
- # 1.yes
- # physical activity level
- # .d:DK
- # .m:Missing
- # .p:proxy
- # .r:Refuse
- # 2.> 1 per week
- # 3.1 per week
- # 4.1-3 per mon
- # 5.hardly ever or never
- # body mass index (BMI)
- # heart_probl
- # stroke
- # glycated haemoglobin (HbA1c)
- # systolic blood pressure (SBP)
- # high-density lipoprotein cholesterol (HDL-C)
- # C-reactive protein
- # 定义需要检查的值
- values_to_check = {1, 2, 3, 4, 5, 6, 7, 8, 95}
- # wave 1
- # 解析core文件
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_core_data_v3.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "heala","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07"]]
- result["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result[['BMI',"hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_ifs_derived_variables.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_one = df[["idauniq", "edqual"]]
- result = pd.merge(result, result_one, on=["idauniq"], how="left")
- change_columns(result)
- print(f"wave 1 finish {result.shape}")
- # wave 2
- # 解析core文件
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_core_data_v4.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_2 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07","hedim08"]]
- result_2["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- # 解析nurse
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_2 = pd.merge(result_2, result_two, on=["idauniq"], how="left")
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_2 = pd.merge(result_2, result_one, on=["idauniq"], how="left")
- change_columns(result_2)
- result = pd.concat([result, result_2], axis=0)
- print(f"wave 2 finish {result.shape}")
- # wave 3
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_elsa_data_v4.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_3 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_3["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_3[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_3 = pd.merge(result_3, result_one, on=["idauniq"], how="left")
- change_columns(result_3)
- result = pd.concat([result, result_3], axis=0)
- print(f"wave 3 finish {result.shape}")
- # wave 4
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_elsa_data_v3.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_4 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_4["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- # 解析nurse
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_nurse_data.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_4 = pd.merge(result_4, result_two, on=["idauniq"], how="left")
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_4 = pd.merge(result_4, result_one, on=["idauniq"], how="left")
- change_columns(result_4)
- result = pd.concat([result, result_4], axis=0)
- print(f"wave 4 finish {result.shape}")
- # wave 5
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_elsa_data_v4.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_5 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_5["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_5[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_5 = pd.merge(result_5, result_one, on=["idauniq"], how="left")
- change_columns(result_5)
- result = pd.concat([result, result_5], axis=0)
- print(f"wave 5 finish {result.shape}")
- # wave 6
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_data_v2.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_6 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_6["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- # 解析nurse
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_6 = pd.merge(result_6, result_two, on=["idauniq"], how="left")
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_6 = pd.merge(result_6, result_one, on=["idauniq"], how="left")
- change_columns(result_6)
- result = pd.concat([result, result_6], axis=0)
- print(f"wave 6 finish {result.shape}")
- # wave 7
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_elsa_data.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_7 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_7["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_7[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_7 = pd.merge(result_7, result_one, on=["idauniq"], how="left")
- change_columns(result_7)
- result = pd.concat([result, result_7], axis=0)
- print(f"wave 7 finish {result.shape}")
- # wave 8
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_data_eul_v2.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_8 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scako","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_8["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_8['BMI'] = np.nan
- # 解析nurse
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_8 = pd.merge(result_8, result_two, on=["idauniq"], how="left")
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_ifs_dvs_eul_v1.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_8 = pd.merge(result_8, result_one, on=["idauniq"], how="left")
- change_columns(result_8)
- result = pd.concat([result, result_8], axis=0)
- print(f"wave 8 finish {result.shape}")
- # wave 9
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
- # 将列名统一转为小写
- df.columns = df.columns.str.lower()
- result_9 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scalcm","heacta", "heactb", "heactc"]]
- #判断是否有心血管疾病
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_9["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_9[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
- # 解析ifs_derived
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_9 = pd.merge(result_9, result_one, on=["idauniq"], how="left")
- change_columns(result_9)
- result = pd.concat([result, result_9], axis=0)
- print(f"wave 9 finish {result.shape}")
- print(result.head())
- result.to_csv("/root/r_base/UKDA-5050-stata/result_all.csv", index=False)
|