ELSA_preprocess.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. import pandas as pd
  2. import numpy as np
  3. #统一列名
  4. def change_columns(df):
  5. df.columns = ["id", "birth_year", "sex", "marital_status", "smoking_status", "drinking_status", "vigoro_sports", "moderate_sports",
  6. "mild_sports","heart_probl", "BMI", "HbA1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","C_reactive_protein", "education"
  7. ]
  8. if __name__ == "__main__":
  9. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
  10. print(df.columns)
  11. # age
  12. # sex
  13. # .u:Unmar
  14. # .v:SP NR
  15. # 1.man
  16. # 2.woman
  17. # marital status
  18. # .d:DK
  19. # .r:Refuse
  20. # 1.married
  21. # 3.partnered
  22. # 4.separated
  23. # 5.divorced
  24. # 7.widowed
  25. # 8.never married
  26. # education
  27. # .d:DK
  28. # .m:Missing
  29. # .o:other
  30. # .r:Refuse
  31. # 1.lt high-school
  32. # 3.high-school graduate
  33. # 4.some college
  34. # 5.college and above
  35. # smoking status
  36. # .d:DK
  37. # .m:Missing
  38. # .p:proxy
  39. # .r:Refuse
  40. # 0.No
  41. # 1.Yes
  42. # drinking status
  43. # .c:no self-completion inter
  44. # .d:DK
  45. # .m:Missing
  46. # .p:proxy
  47. # .r:Refuse
  48. # 0.no
  49. # 1.yes
  50. # physical activity level
  51. # .d:DK
  52. # .m:Missing
  53. # .p:proxy
  54. # .r:Refuse
  55. # 2.> 1 per week
  56. # 3.1 per week
  57. # 4.1-3 per mon
  58. # 5.hardly ever or never
  59. # body mass index (BMI)
  60. # heart_probl
  61. # stroke
  62. # glycated haemoglobin (HbA1c)
  63. # systolic blood pressure (SBP)
  64. # high-density lipoprotein cholesterol (HDL-C)
  65. # C-reactive protein
  66. # 定义需要检查的值
  67. values_to_check = {1, 2, 3, 4, 5, 6, 7, 8, 95}
  68. # wave 1
  69. # 解析core文件
  70. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_core_data_v3.dta', convert_categoricals=False)
  71. # 将列名统一转为小写
  72. df.columns = df.columns.str.lower()
  73. result = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "heala","heacta", "heactb", "heactc"]]
  74. #判断是否有心血管疾病
  75. heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07"]]
  76. result["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  77. result[['BMI',"hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
  78. # 解析ifs_derived
  79. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_ifs_derived_variables.dta', convert_categoricals=False)
  80. df.columns = df.columns.str.lower()
  81. result_one = df[["idauniq", "edqual"]]
  82. result = pd.merge(result, result_one, on=["idauniq"], how="left")
  83. change_columns(result)
  84. print(f"wave 1 finish {result.shape}")
  85. # wave 2
  86. # 解析core文件
  87. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_core_data_v4.dta', convert_categoricals=False)
  88. # 将列名统一转为小写
  89. df.columns = df.columns.str.lower()
  90. result_2 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
  91. #判断是否有心血管疾病
  92. heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07","hedim08"]]
  93. result_2["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  94. # 解析nurse
  95. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
  96. df.columns = df.columns.str.lower()
  97. result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
  98. result_2 = pd.merge(result_2, result_two, on=["idauniq"], how="left")
  99. # 解析ifs_derived
  100. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_ifs_derived_variables.dta', convert_categoricals=False)
  101. result_one = df[["idauniq", "edqual"]]
  102. result_2 = pd.merge(result_2, result_one, on=["idauniq"], how="left")
  103. change_columns(result_2)
  104. result = pd.concat([result, result_2], axis=0)
  105. print(f"wave 2 finish {result.shape}")
  106. # wave 3
  107. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_elsa_data_v4.dta', convert_categoricals=False)
  108. # 将列名统一转为小写
  109. df.columns = df.columns.str.lower()
  110. result_3 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
  111. #判断是否有心血管疾病
  112. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  113. result_3["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  114. result_3[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
  115. # 解析ifs_derived
  116. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_ifs_derived_variables.dta', convert_categoricals=False)
  117. result_one = df[["idauniq", "edqual"]]
  118. result_3 = pd.merge(result_3, result_one, on=["idauniq"], how="left")
  119. change_columns(result_3)
  120. result = pd.concat([result, result_3], axis=0)
  121. print(f"wave 3 finish {result.shape}")
  122. # wave 4
  123. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_elsa_data_v3.dta', convert_categoricals=False)
  124. # 将列名统一转为小写
  125. df.columns = df.columns.str.lower()
  126. result_4 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
  127. #判断是否有心血管疾病
  128. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  129. result_4["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  130. # 解析nurse
  131. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_nurse_data.dta', convert_categoricals=False)
  132. df.columns = df.columns.str.lower()
  133. result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
  134. result_4 = pd.merge(result_4, result_two, on=["idauniq"], how="left")
  135. # 解析ifs_derived
  136. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_ifs_derived_variables.dta', convert_categoricals=False)
  137. result_one = df[["idauniq", "edqual"]]
  138. result_4 = pd.merge(result_4, result_one, on=["idauniq"], how="left")
  139. change_columns(result_4)
  140. result = pd.concat([result, result_4], axis=0)
  141. print(f"wave 4 finish {result.shape}")
  142. # wave 5
  143. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_elsa_data_v4.dta', convert_categoricals=False)
  144. # 将列名统一转为小写
  145. df.columns = df.columns.str.lower()
  146. result_5 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
  147. #判断是否有心血管疾病
  148. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  149. result_5["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  150. result_5[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
  151. # 解析ifs_derived
  152. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_ifs_derived_variables.dta', convert_categoricals=False)
  153. result_one = df[["idauniq", "edqual"]]
  154. result_5 = pd.merge(result_5, result_one, on=["idauniq"], how="left")
  155. change_columns(result_5)
  156. result = pd.concat([result, result_5], axis=0)
  157. print(f"wave 5 finish {result.shape}")
  158. # wave 6
  159. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_data_v2.dta', convert_categoricals=False)
  160. # 将列名统一转为小写
  161. df.columns = df.columns.str.lower()
  162. result_6 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
  163. #判断是否有心血管疾病
  164. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  165. result_6["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  166. # 解析nurse
  167. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
  168. df.columns = df.columns.str.lower()
  169. result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
  170. result_6 = pd.merge(result_6, result_two, on=["idauniq"], how="left")
  171. # 解析ifs_derived
  172. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_ifs_derived_variables.dta', convert_categoricals=False)
  173. result_one = df[["idauniq", "edqual"]]
  174. result_6 = pd.merge(result_6, result_one, on=["idauniq"], how="left")
  175. change_columns(result_6)
  176. result = pd.concat([result, result_6], axis=0)
  177. print(f"wave 6 finish {result.shape}")
  178. # wave 7
  179. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_elsa_data.dta', convert_categoricals=False)
  180. # 将列名统一转为小写
  181. df.columns = df.columns.str.lower()
  182. result_7 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
  183. #判断是否有心血管疾病
  184. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  185. result_7["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  186. result_7[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
  187. # 解析ifs_derived
  188. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_ifs_derived_variables.dta', convert_categoricals=False)
  189. result_one = df[["idauniq", "edqual"]]
  190. result_7 = pd.merge(result_7, result_one, on=["idauniq"], how="left")
  191. change_columns(result_7)
  192. result = pd.concat([result, result_7], axis=0)
  193. print(f"wave 7 finish {result.shape}")
  194. # wave 8
  195. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_data_eul_v2.dta', convert_categoricals=False)
  196. # 将列名统一转为小写
  197. df.columns = df.columns.str.lower()
  198. result_8 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scako","heacta", "heactb", "heactc"]]
  199. #判断是否有心血管疾病
  200. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  201. result_8["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  202. result_8['BMI'] = np.nan
  203. # 解析nurse
  204. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
  205. df.columns = df.columns.str.lower()
  206. result_two = df[["idauniq", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
  207. result_8 = pd.merge(result_8, result_two, on=["idauniq"], how="left")
  208. # 解析ifs_derived
  209. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_ifs_dvs_eul_v1.dta', convert_categoricals=False)
  210. result_one = df[["idauniq", "edqual"]]
  211. result_8 = pd.merge(result_8, result_one, on=["idauniq"], how="left")
  212. change_columns(result_8)
  213. result = pd.concat([result, result_8], axis=0)
  214. print(f"wave 8 finish {result.shape}")
  215. # wave 9
  216. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
  217. # 将列名统一转为小写
  218. df.columns = df.columns.str.lower()
  219. result_9 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scalcm","heacta", "heactb", "heactc"]]
  220. #判断是否有心血管疾病
  221. heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
  222. result_9["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
  223. result_9[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
  224. # 解析ifs_derived
  225. df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
  226. result_one = df[["idauniq", "edqual"]]
  227. result_9 = pd.merge(result_9, result_one, on=["idauniq"], how="left")
  228. change_columns(result_9)
  229. result = pd.concat([result, result_9], axis=0)
  230. print(f"wave 9 finish {result.shape}")
  231. print(result.head())
  232. result.to_csv("/root/r_base/UKDA-5050-stata/result_all.csv", index=False)