123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
- import pandas as pd
- import numpy as np
- def change_columns(df):
- df.columns = ["id", "birth_year", "sex", "marital_status", "smoking_status", "drinking_status", "vigoro_sports", "moderate_sports",
- "mild_sports","heart_probl", "BMI", "HbA1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","C_reactive_protein", "education"
- ]
- if __name__ == "__main__":
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
- print(df.columns)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- values_to_check = {1, 2, 3, 4, 5, 6, 7, 8, 95}
-
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_core_data_v3.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "heala","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07"]]
- result["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result[['BMI',"hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_ifs_derived_variables.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_one = df[["idauniq", "edqual"]]
- result = pd.merge(result, result_one, on=["idauniq"], how="left")
- change_columns(result)
- print(f"wave 1 finish {result.shape}")
-
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_core_data_v4.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_2 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07","hedim08"]]
- result_2["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_2 = pd.merge(result_2, result_two, on=["idauniq"], how="left")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_2 = pd.merge(result_2, result_one, on=["idauniq"], how="left")
- change_columns(result_2)
- result = pd.concat([result, result_2], axis=0)
- print(f"wave 2 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_elsa_data_v4.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_3 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_3["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_3[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_3 = pd.merge(result_3, result_one, on=["idauniq"], how="left")
- change_columns(result_3)
- result = pd.concat([result, result_3], axis=0)
- print(f"wave 3 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_elsa_data_v3.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_4 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_4["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_nurse_data.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_4 = pd.merge(result_4, result_two, on=["idauniq"], how="left")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_4 = pd.merge(result_4, result_one, on=["idauniq"], how="left")
- change_columns(result_4)
- result = pd.concat([result, result_4], axis=0)
- print(f"wave 4 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_elsa_data_v4.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_5 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_5["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_5[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_5 = pd.merge(result_5, result_one, on=["idauniq"], how="left")
- change_columns(result_5)
- result = pd.concat([result, result_5], axis=0)
- print(f"wave 5 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_data_v2.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_6 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_6["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_6 = pd.merge(result_6, result_two, on=["idauniq"], how="left")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_6 = pd.merge(result_6, result_one, on=["idauniq"], how="left")
- change_columns(result_6)
- result = pd.concat([result, result_6], axis=0)
- print(f"wave 6 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_elsa_data.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_7 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_7["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_7[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_7 = pd.merge(result_7, result_one, on=["idauniq"], how="left")
- change_columns(result_7)
- result = pd.concat([result, result_7], axis=0)
- print(f"wave 7 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_data_eul_v2.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_8 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scako","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_8["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_8['BMI'] = np.nan
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
- df.columns = df.columns.str.lower()
- result_two = df[["idauniq", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
- result_8 = pd.merge(result_8, result_two, on=["idauniq"], how="left")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_ifs_dvs_eul_v1.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_8 = pd.merge(result_8, result_one, on=["idauniq"], how="left")
- change_columns(result_8)
- result = pd.concat([result, result_8], axis=0)
- print(f"wave 8 finish {result.shape}")
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
-
- df.columns = df.columns.str.lower()
- result_9 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scalcm","heacta", "heactb", "heactc"]]
-
- heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
- result_9["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
- result_9[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
-
- df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
- result_one = df[["idauniq", "edqual"]]
- result_9 = pd.merge(result_9, result_one, on=["idauniq"], how="left")
- change_columns(result_9)
- result = pd.concat([result, result_9], axis=0)
- print(f"wave 9 finish {result.shape}")
- print(result.head())
- result.to_csv("/root/r_base/UKDA-5050-stata/result_all.csv", index=False)
|