CHARLS_harmonized.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. import pandas as pd
  2. import numpy as np
  3. import pyreadstat
  4. from datetime import date
  5. from lunarcalendar import Converter, Lunar
  6. # 定义一个函数,用于更新 harmonized 中的 mstat 列
  7. def update_mstat(harmonized, col_name):
  8. harmonized[col_name] = harmonized[col_name].apply(
  9. lambda x: 1 if x in [1, 3] else 0 if x in [4, 5, 7, 8] else np.nan
  10. )
  11. def update_physical(harmonized):
  12. harmonized["r1phys"] = harmonized.apply(lambda x : 2 if x["r1vgact_c"]==1 else
  13. 1 if x["r1mdact_c"]==1 else
  14. 0 if x["r1ltact_c"] == 1 or (x["r1vgact_c"]==0 and x["r1mdact_c"]==0 and x["r1ltact_c"] == 0)
  15. else np.nan ,axis=1)
  16. harmonized["r2phys"] = harmonized.apply(lambda x : 2 if x["r2vgact_c"]==1 else
  17. 1 if x["r2mdact_c"]==1 else
  18. 0 if x["r2ltact_c"] == 1 or (x["r2vgact_c"]==0 and x["r2mdact_c"]==0 and x["r2ltact_c"] == 0)
  19. else np.nan ,axis=1)
  20. harmonized["r3phys"] = harmonized.apply(lambda x : 2 if x["r3vgact_c"]==1 else
  21. 1 if x["r3mdact_c"]==1 else
  22. 0 if x["r3ltact_c"] == 1 or (x["r3vgact_c"]==0 and x["r3mdact_c"]==0 and x["r3ltact_c"] == 0)
  23. else np.nan ,axis=1)
  24. harmonized["r4phys"] = harmonized.apply(lambda x : 2 if x["r4vgact_c"]==1 else
  25. 1 if x["r4mdact_c"]==1 else
  26. 0 if x["r4ltact_c"] == 1 or (x["r4vgact_c"]==0 and x["r4mdact_c"]==0 and x["r4ltact_c"] == 0)
  27. else np.nan ,axis=1)
  28. def merge_data(harmonized, waves, flag="other"):
  29. merged_data = []
  30. # 遍历年份和列名,处理合并数据
  31. for wave, col_name in waves:
  32. if flag=="mstat":
  33. update_mstat(harmonized, col_name)
  34. elif flag == "phys":
  35. update_physical(harmonized)
  36. # 获取对应年份的数据,并将结果存入列表
  37. merged_data.append(pd.merge(
  38. CHARLS_data[CHARLS_data["wave"] == wave],
  39. harmonized[["ID", col_name]],
  40. on="ID",
  41. how="left"
  42. )[col_name])
  43. return merged_data
  44. # 通过 groupby 采用少数服从多数原则填充性别
  45. def fill_gender(group, col):
  46. # 计算性别众数
  47. mode_gender = group[col].mode()
  48. if not mode_gender.empty:
  49. # 用众数替换组内所有性别值
  50. group[col] = mode_gender[0]
  51. return group
  52. def calculate_age(row):
  53. # 检查空值
  54. if pd.isnull(row['birth_year']) or pd.isnull(row['birth_month']) or pd.isnull(row['iyear']) or pd.isnull(row['imonth']):
  55. return np.nan # 返回 NaN 代表无法计算年龄
  56. # 获取出生年月
  57. birth_year = int(row['birth_year'])
  58. birth_month = int(row['birth_month'])
  59. if birth_month == 0:
  60. birth_month = 6
  61. # 确定出生日期
  62. if row['ba003'] == 1:
  63. # 公历
  64. birth_date = date(birth_year, birth_month, 1)
  65. else:
  66. lunar = Lunar(birth_year, birth_month, 1, isleap=False)
  67. # 农历
  68. birth_date = Converter.Lunar2Solar(lunar)
  69. # 获取随访年月
  70. followup_year = int(row['iyear'])
  71. followup_month = int(row['imonth'])
  72. followup_date = date(followup_year, followup_month, 1)
  73. # 计算年龄
  74. age = followup_date.year - birth_date.year - ((followup_date.month, followup_date.day) < (birth_date.month, birth_date.day))
  75. return age
  76. if __name__ == "__main__":
  77. harmonized, meta = pyreadstat.read_dta("/root/r_base/CHARLS/Harmonized_CHARLS/H_CHARLS_D_Data.dta")
  78. CHARLS_data = pd.read_csv("CHARLS_data_p_n_m_nd.csv")
  79. harmonized['ID'] = harmonized['ID'].astype(str) # 转换为字符串
  80. CHARLS_data['ID'] = CHARLS_data['ID'].astype(str) # 转换为字符串
  81. #婚姻状况
  82. # 1 married or partnered
  83. # 0 other marital status (separated, divorced, unmarried, or widowed)
  84. # 定义年份和对应的列名
  85. waves = [(2011, "r1mstat"), (2013, "r2mstat"), (2015, "r3mstat"), (2018, "r4mstat")]
  86. # 将四列数据合并为一列,并赋值给 CHARLS_data["mstat"]
  87. CHARLS_data["marital_status_m"] = pd.concat(merge_data(harmonized, waves, "mstat"), ignore_index=True)
  88. #身高
  89. waves = [(2011, "r1mheight"), (2013, "r2mheight"), (2015, "r3mheight")]
  90. CHARLS_data["Height_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  91. #体重
  92. waves = [(2011, "r1mweight"), (2013, "r2mweight"), (2015, "r3mweight")]
  93. CHARLS_data["Weight_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  94. #腰围
  95. waves = [(2011, "r1mwaist"), (2013, "r2mwaist"), (2015, "r3mwaist")]
  96. CHARLS_data["waist_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  97. #BMI
  98. waves = [(2011, "r1mbmi"), (2013, "r2mbmi"), (2015, "r3mbmi")]
  99. CHARLS_data["BMI_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  100. #收缩压#舒张压
  101. waves = [(2011, "r1systo"), (2013, "r2systo"), (2015, "r3systo")]
  102. CHARLS_data["Systolic_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  103. waves = [(2011, "r1diasto"), (2013, "r2diasto"), (2015, "r3diasto")]
  104. CHARLS_data["Diastolic_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  105. # 体力活动
  106. # 2 vigorous (vigorous activity more than once a week)
  107. # 1 moderate (moderate activity more than once a week)
  108. # 0 inactive (the rest)
  109. waves = [(2011, "r1phys"), (2013, "r2phys"), (2015, "r3phys"), (2018, "r4phys")]
  110. CHARLS_data["Physical_activity_m"] = pd.concat(merge_data(harmonized, waves, "phys"), ignore_index=True)
  111. # 抽烟
  112. # 1 抽过烟
  113. # 0 没有抽过烟
  114. waves = [(2011, "r1smokev"), (2013, "r2smokev"), (2015, "r3smokev"), (2018, "r4smokev")]
  115. CHARLS_data["Smoke_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  116. # 喝酒
  117. # 1 喝过酒
  118. # 0 没有喝过酒
  119. waves = [(2011, "r1drinkev"), (2013, "r2drinkev"), (2015, "r3drinkev"), (2018, "r4drinkev")]
  120. CHARLS_data["Drink_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  121. #慢性病
  122. waves = [(2011, "r1hibpe"), (2013, "r2hibpe"), (2015, "r3hibpe"), (2018, "r4hibpe")]
  123. CHARLS_data["Hypertension_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  124. waves = [(2011, "r1diabe"), (2013, "r2diabe"), (2015, "r3diabe"), (2018, "r4diabe")]
  125. CHARLS_data["Disabetes_or_High_Blood_Sugar_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  126. waves = [(2011, "r1cancre"), (2013, "r2cancre"), (2015, "r3cancre"), (2018, "r4cancre")]
  127. CHARLS_data["Cancer_or_Malignant_Tumor_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  128. waves = [(2011, "r1lunge"), (2013, "r2lunge"), (2015, "r3lunge"), (2018, "r4lunge")]
  129. CHARLS_data["Chronic_Lung_Diseases_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  130. waves = [(2011, "r1hearte"), (2013, "r2hearte"), (2015, "r3hearte"), (2018, "r4hearte")]
  131. CHARLS_data["Heart_Problems_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  132. waves = [(2011, "r1psyche"), (2013, "r2psyche"), (2015, "r3psyche"), (2018, "r4psyche")]
  133. CHARLS_data["Emotional_Nervous_or_Psychiatric_Problems_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  134. waves = [(2011, "r1stroke"), (2013, "r2stroke"), (2015, "r3stroke"), (2018, "r4stroke")]
  135. CHARLS_data["Stroke_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  136. waves = [(2011, "r1arthre"), (2013, "r2arthre"), (2015, "r3arthre"), (2018, "r4arthre")]
  137. CHARLS_data["Arthritis_or_Rheumatism_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  138. waves = [(2011, "r1dyslipe"), (2013, "r2dyslipe"), (2015, "r3dyslipe"), (2018, "r4dyslipe")]
  139. CHARLS_data["Dyslipidemia_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  140. waves = [(2011, "r1livere"), (2013, "r2livere"), (2015, "r3livere"), (2018, "r4livere")]
  141. CHARLS_data["Liver_Disease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  142. waves = [(2011, "r1kidneye"), (2013, "r2kidneye"), (2015, "r3kidneye"), (2018, "r4kidneye")]
  143. CHARLS_data["Kidney_Diease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  144. waves = [(2011, "r1digeste"), (2013, "r2digeste"), (2015, "r3digeste"), (2018, "r4digeste")]
  145. CHARLS_data["Stomach_or_Other_Digestive_Disease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  146. waves = [(2011, "r1asthmae"), (2013, "r2asthmae"), (2015, "r3asthmae"), (2018, "r4asthmae")]
  147. CHARLS_data["Asthma_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  148. waves = [(2011, "r1memrye"), (2013, "r2memrye"), (2015, "r3memrye"), (2018, "r4memrye")]
  149. CHARLS_data["Memory_Related_Disease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  150. #心理评分
  151. waves = [(2011, "s1cesd10"), (2013, "s2cesd10"), (2015, "s3cesd10"), (2018, "s4cesd10")]
  152. CHARLS_data["Psychiatric_score_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  153. #睡眠状态
  154. waves = [(2011, "r1sleeprl"), (2013, "r2sleeprl"), (2015, "r3sleeprl"), (2018, "r4sleeprl")]
  155. CHARLS_data["sleep_state_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  156. # ADL
  157. waves = [(2011, "s1adlab_c"), (2013, "s2adlab_c"), (2015, "s3adlab_c"), (2018, "s4adlab_c")]
  158. CHARLS_data["ADL_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  159. #年龄
  160. waves = [(2011, "r1agey"), (2013, "r2agey"), (2015, "r3agey"), (2018, "r4agey")]
  161. CHARLS_data["age_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  162. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  163. waves = [(2011, "r1orient"), (2013, "r2orient"), (2015, "r3orient"), (2018, "r4orient")]
  164. CHARLS_data["Date_Naming"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  165. waves = [(2011, "r1imrc"), (2013, "r2imrc"), (2015, "r3imrc"), (2018, "r4imrc")]
  166. CHARLS_data["Immediate_Word_Recall"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  167. waves = [(2011, "r1dlrc"), (2013, "r2dlrc"), (2015, "r3dlrc"), (2018, "r4dlrc")]
  168. CHARLS_data["Delayed_Word_Recall"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  169. waves = [(2011, "r1ser7"), (2013, "r2ser7"), (2015, "r3ser7"), (2018, "r4ser7")]
  170. CHARLS_data["Serial_7"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  171. waves = [(2011, "r1draw"), (2013, "r2draw"), (2015, "r3draw"), (2018, "r4draw")]
  172. CHARLS_data["Drawing_Picture"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
  173. CHARLS_data["Cognition_score_m"] = CHARLS_data["Date_Naming"] + CHARLS_data["Immediate_Word_Recall"] + CHARLS_data["Delayed_Word_Recall"] + CHARLS_data["Serial_7"] + CHARLS_data["Drawing_Picture"]
  174. # 整体合并的:性别,出生年,教育
  175. #教育
  176. # 0 below high school
  177. # 1 high school
  178. # 2 college or above
  179. harmonized["raeduc_c"] = harmonized["raeduc_c"].apply(lambda x : 1 if x in [3,4,5,6,7,8, 9, 10] else 0 if x in [1,2] else np.nan)
  180. CHARLS_data = pd.merge(CHARLS_data, harmonized[["ID", "ragender", "rabyear", "raeduc_c"]], on='ID', how='left')
  181. #合并
  182. merge_list = ["marital_status_m", "Height_m", "Weight_m", "waist_m", "Systolic_m", "Diastolic_m",
  183. "Physical_activity_m", "Smoke_m", 'Drink_m', 'Hypertension_m', 'Disabetes_or_High_Blood_Sugar_m',
  184. 'Cancer_or_Malignant_Tumor_m', 'Chronic_Lung_Diseases_m', 'Heart_Problems_m', 'Emotional_Nervous_or_Psychiatric_Problems_m',
  185. 'Stroke_m', 'Arthritis_or_Rheumatism_m', 'Dyslipidemia_m', 'Liver_Disease_m', 'Kidney_Diease_m', 'Stomach_or_Other_Digestive_Disease_m',
  186. 'Asthma_m', 'Memory_Related_Disease_m', 'Psychiatric_score_m', 'sleep_state_m', 'Cognition_score_m', "age_m", "ADL_m"]
  187. #先处理身高1~2单位为米,大于3为cm
  188. CHARLS_data['Height'] = CHARLS_data['Height'].apply(lambda x: x if 1 <= x <= 2 else (x / 100 if x > 3 else x))
  189. # 遍历 merge_list 列表
  190. for col_m in merge_list:
  191. col = col_m.replace('_m', '') # 去掉 '_m' 得到相应的列名
  192. if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
  193. CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
  194. # 计算BMI
  195. CHARLS_data['BMI'] = CHARLS_data['Weight'] /(CHARLS_data['Height'] * CHARLS_data['Height'])
  196. CHARLS_data['BMI'] = CHARLS_data["BMI_m"].fillna(CHARLS_data['BMI'])
  197. # 处理慢性病标准不一样,将2变为0
  198. chronic_disease = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  199. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  200. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  201. CHARLS_data[chronic_disease] = CHARLS_data[chronic_disease].replace(2, 0)
  202. #处理"ragender", "rabyear", "raeduc_c"
  203. common_new_list = ["ragender", "rabyear", "raeduc_c"]
  204. common_list = ["rgender", "birth_year", "education"]
  205. for col_m, col in zip(common_new_list, common_list):
  206. if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
  207. CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
  208. CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture", "BMI_m"] + merge_list+ common_new_list)
  209. #处理性别
  210. CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "rgender")).reset_index(drop=True)
  211. #处理出生年月
  212. CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_year")).reset_index(drop=True)
  213. CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_month")).reset_index(drop=True)
  214. #处理教育
  215. CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "education")).reset_index(drop=True)
  216. #重新计算年龄
  217. CHARLS_data["age"] = CHARLS_data.apply(calculate_age, axis=1)
  218. CHARLS_data.to_csv("CHARLS_data_p_n_m_nd_h.csv", index=False)