# CLHLS_process.py
import numpy as np
import pandas as pd
import pyreadstat
from savReaderWriter import SavReader
  5. def sav2csv(sav_file_path, csv_file_path):
  6. #读取sav数据
  7. with SavReader(sav_file_path) as reader:
  8. header = reader.header
  9. data = reader.all()
  10. #转化为Dataframe
  11. df = pd.DataFrame(data, columns=header)
  12. # 将列名从字节串转换为字符串
  13. df.columns = [col.decode('utf-8') if isinstance(col, bytes) else col for col in df.columns]
  14. # 另存为csv数据
  15. df.to_csv(csv_file_path, index=False)
  16. #进行adl转换
  17. def trans_adl(columns_adl, data, result, adl_name):
  18. # 定义转换规则
  19. transformation = {1: 0, 2: 1, 3: 2}
  20. for column in columns_adl:
  21. data[column] = data[column].map(transformation).fillna(np.nan) # 将其他转换为缺失值
  22. # 计算转换后列的总和并创建一个新列
  23. data[adl_name] = data[columns_adl].sum(axis=1)
  24. result[adl_name] = data[adl_name].apply(lambda x : 0 if x==0 else (1 if x>0 and x<=12 else np.nan))
  25. #进行mmse转换
  26. def trans_mmse(columns_mmse, data):
  27. # 定义转换规则
  28. transformation = {0: 0, 1: 1}
  29. for column in columns_mmse:
  30. data[column] = data[column].map(transformation).fillna(np.nan) # 将其他转换为缺失值
  31. #进行mmsec16转换
  32. def trans_mmse_c16(columns_mmse, data):
  33. # 定义转换规则
  34. for column in columns_mmse:
  35. data[column] = data[column].apply(lambda x : np.nan if x==88 or x==99 or x==-9 or x==-8 or x==-7 or x==-6 else 1) # 将其他转换为缺失值
  36. def get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, columns_memory_98 ,columns_language_98, data, result, cognitive_name):
  37. # 计算一般能力
  38. result["general_cognitive_"+cognitive_name] = data[columns_cognitive_98].sum(axis=1) *2
  39. # 计算反应能力
  40. result["reaction_"+cognitive_name] = data[columns_reaction_98].sum(axis=1)
  41. # 计算注意力与计算力
  42. result["attention_calculation_"+cognitive_name] = data[columns_attention_98].sum(axis=1)
  43. # 计算回忆力
  44. result["memory_"+cognitive_name] = data[columns_memory_98].sum(axis=1)
  45. # 计算语言能力和自我协调
  46. result["language_selfcoordination_"+cognitive_name] = data[columns_language_98].sum(axis=1)
  47. # 计算总合
  48. result['mmse_'+cognitive_name] = result["general_cognitive_"+cognitive_name] + result["reaction_"+cognitive_name]+ result["attention_calculation_"+cognitive_name]+ result["memory_"+cognitive_name]+ result["language_selfcoordination_"+cognitive_name]
  49. def deal_1998_2018_data():
  50. sav_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.sav"
  51. csv_file_path = "CLHLS/clhls_1998_2018_longitudinal_dataset_released_version1.csv"
  52. # 将sav数据转为csv
  53. # sav2csv(sav_file_path, csv_file_path)
  54. #处理数据
  55. data = pd.read_csv(csv_file_path)
  56. # 存活状态0存活;1死亡;-9失访;-8死亡/失访
  57. result = data[['id', 'dth98_00','dth00_02', 'dth02_05', 'dth02_05', 'dth05_08', 'dth08_11', 'dth11_14', 'dth14_18']]
  58. # 人口特征学变量
  59. # 8/9代表无法回答和缺失
  60. # 年龄
  61. result[['trueage_98','trueage_00','trueage_02','trueage_05','trueage_08','trueage_11', 'trueage_14', 'trueage_18']] = data[['trueage','vage_0','vage_2', 'vage_5', 'vage_8','vage_11', 'trueage_14', 'trueage_18']]
  62. # 性别 1男;0女
  63. result['sex'] = data['a1'].apply(lambda x : 1 if x==1 else 0)
  64. # 民族 1汉族;0非汉族
  65. result['ethnic'] = data['a2'].apply(lambda x : 1 if x==1 else 0)
  66. # 出生地 1城市;0农村
  67. result['birth_place'] = data['a42'].apply(lambda x : 1 if x == 1 else (0 if x == 2 else np.nan))
  68. # 教育状况 无11年
  69. result['edu_98'] = data['f1'].apply(lambda x : np.nan if x==88 or x==99 else x)
  70. result['edu_08'] = data['f1_8'].apply(lambda x : np.nan if x==88 or x==99 else x)
  71. result['edu_14'] = data['f1_14'].apply(lambda x : np.nan if x==88 or x==99 else x)
  72. result['edu_18'] = data['f1_18'].apply(lambda x : np.nan if x==88 or x==99 else x)
  73. # 婚姻状况 0separated/divorced/widowed/never married; 1currently married and living with spouse
  74. result['marital_98'] = data['f41'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
  75. result['marital_00'] = data['f41_0'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 else 1))
  76. result['marital_02'] = data['f41_2'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 else 1))
  77. result['marital_05'] = data['f41_5'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 else 1))
  78. result['marital_08'] = data['f41_8'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-6 or x==-8 or x==-7 else 1))
  79. result['marital_08'] = data['f41_8'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
  80. result['marital_11'] = data['f41_11'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
  81. result['marital_14'] = data['f41_14'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
  82. result['marital_18'] = data['f41_18'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
  83. # 生活是否富裕 1富裕及以上;0一般及以下
  84. result['econ_state_00'] = data['f34_0'].apply(lambda x : 0 if x==2 or x==3 else (1 if x==1 else np.nan))
  85. result['econ_state_02'] = data['f34_2'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  86. result['econ_state_05'] = data['f34_5'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  87. result['econ_state_08'] = data['f34_8'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  88. result['econ_state_11'] = data['f34_11'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  89. result['econ_state_14'] = data['f34_14'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  90. result['econ_state_18'] = data['f34_18'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  91. # 上一年家庭收入 99998超过10万
  92. result['income_02'] = data['f35_2'].apply(lambda x : x if x== 99998 else np.nan)
  93. result['income_05'] = data['f35_5'].apply(lambda x : x if x== 99998 else np.nan)
  94. result['income_08'] = data['f35_8'].apply(lambda x : x if x== 99998 else np.nan)
  95. result['income_11'] = data['f35_11'].apply(lambda x : x if x== 99998 else np.nan)
  96. result['income_14'] = data['f35_14'].apply(lambda x : x if x== 99998 else np.nan)
  97. result['income_18'] = data['f35_18'].apply(lambda x : x if x== 99998 else np.nan)
  98. # 居住状态 1与家庭成员同住;2独居;3在机构居住
  99. result['co_residence_98'] = data['a51'].apply(lambda x : np.nan if x==9 else x)
  100. result['co_residence_00'] = data['a51_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
  101. result['co_residence_02'] = data['a51_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  102. result['co_residence_05'] = data['a51_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  103. result['co_residence_08'] = data['a51_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  104. result['co_residence_11'] = data['a51_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  105. result['co_residence_14'] = data['a51_14'].apply(lambda x : np.nan if x==9 else x)
  106. result['co_residence_18'] = data['a51_18'].apply(lambda x : np.nan if x==9 else x)
  107. # 目前是否吸烟 1是;2否
  108. result['smoke_98'] = data['d71'].apply(lambda x : np.nan if x==9 else x)
  109. result['smoke_00'] = data['d71_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
  110. result['smoke_02'] = data['d71_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  111. result['smoke_05'] = data['d71_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  112. result['smoke_08'] = data['d71_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  113. result['smoke_11'] = data['d71_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  114. result['smoke_14'] = data['d71_14'].apply(lambda x : np.nan if x==9 else x)
  115. result['smoke_18'] = data['d71_18'].apply(lambda x : np.nan if x==9 else x)
  116. # 目前是否饮酒 1是;2否
  117. result['drink_98'] = data['d81'].apply(lambda x : np.nan if x==9 else x)
  118. result['drink_00'] = data['d81_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
  119. result['drink_02'] = data['d81_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  120. result['drink_05'] = data['d81_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  121. result['drink_08'] = data['d81_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  122. result['drink_11'] = data['d81_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  123. result['drink_14'] = data['d81_14'].apply(lambda x : np.nan if x==9 else x)
  124. result['drink_18'] = data['d81_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  125. # 目前是否锻炼
  126. result['exercise_98'] = data['d91'].apply(lambda x : np.nan if x==9 else x)
  127. result['exercise_00'] = data['d91_0'].apply(lambda x : np.nan if x==9 or x==-9 or x == -6 else x)
  128. result['exercise_02'] = data['d91_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  129. result['exercise_05'] = data['d91_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 else x)
  130. result['exercise_08'] = data['d91_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
  131. result['exercise_11'] = data['d91_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
  132. result['exercise_14'] = data['d91_14'].apply(lambda x : np.nan if x==9 else x)
  133. result['exercise_18'] = data['d91_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  134. # 健康状况变量 1very good; 2good; 3so so; 4bad; 5very bad;
  135. result['self_reported_helth_98'] = data['b12'].apply(lambda x : np.nan if x==9 or x==8 else x)
  136. result['self_reported_helth_00'] = data['b12_0'].apply(lambda x : np.nan if x==8 or x==-9 or x == -6 else x)
  137. result['self_reported_helth_02'] = data['b12_2'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 or x == 8 else x)
  138. result['self_reported_helth_05'] = data['b12_5'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -6 or x == 8 else x)
  139. result['self_reported_helth_08'] = data['b12_8'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
  140. result['self_reported_helth_11'] = data['b12_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
  141. result['self_reported_helth_14'] = data['b12_14'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  142. result['self_reported_helth_18'] = data['b12_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  143. # 慢性病
  144. result['chronic_00'] = data['g14a1_0'].apply(lambda x : np.nan if x==66 or x==89 or x==99 or x==-9 or x == -6 or x == -1 else x)
  145. result['chronic_02'] = data['g14a1_2'].apply(lambda x : np.nan if x==-9 or x == -8 or x == -6 or x == -1 else x)
  146. result['chronic_05'] = data['g14a1_5'].apply(lambda x : np.nan if x==888 or x==999 or x==-9 or x == -8 or x == -6 or x == -1 else x)
  147. result['chronic_08'] = data['g14a1_8'].apply(lambda x : np.nan if x==66 or x==88 or x == 99 or x==-9 or x == -8 or x == -7 or x == -1 else x)
  148. result['chronic_11'] = data['g14a1_11'].apply(lambda x : np.nan if x==66 or x==88 or x==99 or x==-9 or x == -8 or x == -7 or x==-1 else x)
  149. result['chronic_14'] = data['g14a1_14'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
  150. result['chronic_18'] = data['g14a1_18'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
  151. # 抑郁量表得分-only 18年 0无抑郁症;1有抑郁症
  152. # 简版流调中心抑郁量表(CESD-10)10个CESD项目,每个项目的分值范围为0到3分,将每个CESD项目的分值相加,得到总得分
  153. # 定义转换规则
  154. transformation_one = {1: 3, 2: 2, 3: 2, 4: 1, 5: 0}
  155. # 应用转换规则
  156. columns_cesd_one = ['b31_18', 'b32_18', 'b33_18', 'b34_18', 'b36_18', 'b38_18', 'b39_18']
  157. for column_one in columns_cesd_one:
  158. data[column_one] = data[column_one].map(transformation_one).fillna(np.nan) # 将8转换为缺失值
  159. # 定义转换规则
  160. transformation_two = {1: 0, 2: 1, 3: 1, 4: 2, 5: 3}
  161. # 应用转换规则
  162. columns_cesd_two = ['b35_18', 'b37_18', 'b310a_18']
  163. for column_two in columns_cesd_two:
  164. data[column_two] = data[column_two].map(transformation_two).fillna(np.nan) # 将8转换为缺失值
  165. result['cesd'] = data['b31_18'] + data['b32_18'] + data['b33_18'] + data['b34_18'] + data['b36_18'] + data['b38_18'] + data['b39_18'] + data['b35_18'] + data['b37_18'] + data['b310a_18']
  166. result['cesd_d'] = result['cesd'].apply(lambda x : 0 if x >= 0 and x <= 15 else (1 if x >=16 and x <= 30 else np.nan))
  167. # 日常生活活动能力 0无残疾;1有残疾
  168. # ADL6个项目bathing, dressing, eating, indoor transferring, toileting, and continence, 每个项目的分值范围是0到2分, 将每个ADL项目的得分相加,得到总得分
  169. columns_adl_00 = ['e1_0', 'e2_0', 'e3_0', 'e4_0', 'e5_0', 'e6_0']
  170. trans_adl(columns_adl_00, data, result, "adl_00")
  171. columns_adl_02 = ['e1_2', 'e2_2', 'e3_2', 'e4_2', 'e5_2', 'e6_2']
  172. trans_adl(columns_adl_02, data, result, "adl_02")
  173. columns_adl_05 = ['e1_5', 'e2_5', 'e3_5', 'e4_5', 'e5_5', 'e6_5']
  174. trans_adl(columns_adl_05, data, result, "adl_05")
  175. columns_adl_08 = ['e1_8', 'e2_8', 'e3_8', 'e4_8', 'e5_8', 'e6_8']
  176. trans_adl(columns_adl_08, data, result, "adl_08")
  177. columns_adl_11 = ['e1_11', 'e2_11', 'e3_11', 'e4_11', 'e5_11', 'e6_11']
  178. trans_adl(columns_adl_11, data, result, "adl_11")
  179. columns_adl_14 = ['e1_14', 'e2_14', 'e3_14', 'e4_14', 'e5_14', 'e6_14']
  180. trans_adl(columns_adl_14, data, result, "adl_14")
  181. columns_adl_18 = ['e1_18', 'e2_18', 'e3_18', 'e4_18', 'e5_18', 'e6_18']
  182. trans_adl(columns_adl_18, data, result, "adl_18")
  183. # 认知功能 0有认知功能障碍;1认知功能正常
  184. # 简易精神状态评价量表(Mini-mental State Examination, MMSE),该量表包括一般能力(12分),反应能力(3分),注意力与计算力(6分),回忆力(3分),语言理解
  185. # 与自我协调能力(6分)5个部分24个问题,总分30分,分数越高,表示认知功能水平越高
  186. columns_mmse = ["c11", "c12", "c13", "c14", "c15", "c21a", "c21b", "c21c", "c31a", "c31b", "c31c", "c31d", "c31e", "c32", "c41a", "c41b", "c41c", "c51a", "c51b", "c52", "c53a", "c53b", "c53c",
  187. "c11_0", "c12_0", "c13_0", "c14_0", "c15_0", "c21a_0", "c21b_0", "c21c_0", "c31a_0", "c31b_0", "c31c_0", "c31d_0", "c31e_0", "c32_0", "c41a_0", "c41b_0", "c41c_0", "c51a_0", "c51b_0", "c52_0", "c53a_0", "c53b_0", "c53c_0",
  188. "c11_2", "c12_2", "c13_2", "c14_2", "c15_2", "c21a_2", "c21b_2", "c21c_2", "c31a_2", "c31b_2", "c31c_2", "c31d_2", "c31e_2", "c32_2", "c41a_2", "c41b_2", "c41c_2", "c51a_2", "c51b_2", "c52_2", "c53a_2", "c53b_2", "c53c_2",
  189. "c11_5", "c12_5", "c13_5", "c14_5", "c15_5", "c21a_5", "c21b_5", "c21c_5", "c31a_5", "c31b_5", "c31c_5", "c31d_5", "c31e_5", "c32_5", "c41a_5", "c41b_5", "c41c_5", "c51a_5", "c51b_5", "c52_5", "c53a_5", "c53b_5", "c53c_5",
  190. "c11_8", "c12_8", "c13_8", "c14_8", "c15_8", "c21a_8", "c21b_8", "c21c_8", "c31a_8", "c31b_8", "c31c_8", "c31d_8", "c31e_8", "c32_8", "c41a_8", "c41b_8", "c41c_8", "c51a_8", "c51b_8", "c52_8", "c53a_8", "c53b_8", "c53c_8",
  191. "c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c21a_11", "c21b_11", "c21c_11", "c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11", "c41a_11", "c41b_11", "c41c_11", "c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11",
  192. "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14", "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
  193. "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18", "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
  194. trans_mmse(columns_mmse, data)
  195. columns_mmse_c16 = ["c16", "c16_0", "c16_2", "c16_5", "c16_8", "c16_11", "c16_14", "c16_18"]
  196. trans_mmse_c16(columns_mmse_c16, data)
  197. columns_cognitive_98 = ["c11", "c12", "c13", "c14", "c15", "c16"]
  198. columns_reaction_98 = ["c21a", "c21b", "c21c"]
  199. columns_attention_98 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
  200. columns_memory_98 = ["c41a", "c41b", "c41c"]
  201. columns_language_98 = ["c51a", "c51b", "c52", "c53a", "c53b", "c53c"]
  202. get_mmse(columns_cognitive_98,columns_reaction_98,columns_attention_98, columns_memory_98 ,columns_language_98, data, result, "98")
  203. columns_cognitive_00 = ["c11_0", "c12_0", "c13_0", "c14_0", "c15_0", "c16_0"]
  204. columns_reaction_00 = ["c21a_0", "c21b_0", "c21c_0"]
  205. columns_attention_00 = ["c31a_0", "c31b_0", "c31c_0", "c31d_0", "c31e_0", "c32_0"]
  206. columns_memory_00 = ["c41a_0", "c41b_0", "c41c_0"]
  207. columns_language_00 = ["c51a_0", "c51b_0", "c52_0", "c53a_0", "c53b_0", "c53c_0"]
  208. get_mmse(columns_cognitive_00,columns_reaction_00,columns_attention_00, columns_memory_00 ,columns_language_00, data, result, "00")
  209. columns_cognitive_02 = ["c11_2", "c12_2", "c13_2", "c14_2", "c15_2", "c16_2"]
  210. columns_reaction_02 = ["c21a_2", "c21b_2", "c21c_2"]
  211. columns_attention_02 = ["c31a_2", "c31b_2", "c31c_2", "c31d_2", "c31e_2", "c32_2"]
  212. columns_memory_02 = ["c41a_2", "c41b_2", "c41c_2"]
  213. columns_language_02 = ["c51a_2", "c51b_2", "c52_2", "c53a_2", "c53b_2", "c53c_2"]
  214. get_mmse(columns_cognitive_02,columns_reaction_02,columns_attention_02, columns_memory_02 ,columns_language_02, data, result, "02")
  215. columns_cognitive_05 = ["c11_5", "c12_5", "c13_5", "c14_5", "c15_5", "c16_5"]
  216. columns_reaction_05 = ["c21a_5", "c21b_5", "c21c_5"]
  217. columns_attention_05 = ["c31a_5", "c31b_5", "c31c_5", "c31d_5", "c31e_5", "c32_5"]
  218. columns_memory_05 = ["c41a_5", "c41b_5", "c41c_5"]
  219. columns_language_05 = ["c51a_5", "c51b_5", "c52_5", "c53a_5", "c53b_5", "c53c_5"]
  220. get_mmse(columns_cognitive_05,columns_reaction_05,columns_attention_05, columns_memory_05 ,columns_language_05, data, result, "05")
  221. columns_cognitive_08 = ["c11_8", "c12_8", "c13_8", "c14_8", "c15_8", "c16_8"]
  222. columns_reaction_08 = ["c21a_8", "c21b_8", "c21c_8"]
  223. columns_attention_08 = ["c31a_8", "c31b_8", "c31c_8", "c31d_8", "c31e_8", "c32_8"]
  224. columns_memory_08 = ["c41a_8", "c41b_8", "c41c_8"]
  225. columns_language_08 = ["c51a_8", "c51b_8", "c52_8", "c53a_8", "c53b_8", "c53c_8"]
  226. get_mmse(columns_cognitive_08,columns_reaction_08,columns_attention_08, columns_memory_08 ,columns_language_08, data, result, "08")
  227. columns_cognitive_11 = ["c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c16_11"]
  228. columns_reaction_11 = ["c21a_11", "c21b_11", "c21c_11"]
  229. columns_attention_11 = ["c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11"]
  230. columns_memory_11 = ["c41a_11", "c41b_11", "c41c_11"]
  231. columns_language_11 = ["c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11"]
  232. get_mmse(columns_cognitive_11,columns_reaction_11,columns_attention_11, columns_memory_11 ,columns_language_11, data, result, "11")
  233. columns_cognitive_14 = ["c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c16_14"]
  234. columns_reaction_14 = ["c21a_14", "c21b_14", "c21c_14"]
  235. columns_attention_14 = ["c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14"]
  236. columns_memory_14 = ["c41a_14", "c41b_14", "c41c_14"]
  237. columns_language_14 = ["c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14"]
  238. get_mmse(columns_cognitive_14,columns_reaction_14,columns_attention_14, columns_memory_14 ,columns_language_14, data, result, "14")
  239. columns_cognitive_18 = ["c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c16_18"]
  240. columns_reaction_18 = ["c21a_18", "c21b_18", "c21c_18"]
  241. columns_attention_18 = ["c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18"]
  242. columns_memory_18 = ["c41a_18", "c41b_18", "c41c_18"]
  243. columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
  244. get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
  245. # #血液指标
  246. # columns_to_exclude = ['midn', 'trueage', 'a1'] # 替换为你要排除的列名
  247. # #2008
  248. # biomarker_08 = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2008-1.tab", sep='\t')
  249. # biomarker_08 = biomarker_08.drop(columns=columns_to_exclude)
  250. # columns_bio = ["id"]
  251. # for col in biomarker_08.columns:
  252. # if not col == "id":
  253. # columns_bio.append(col+"_08")
  254. # biomarker_08.columns = columns_bio
  255. # result = pd.merge(result, biomarker_08, on = ["id"], how="left")
  256. print(result.head())
  257. result.to_csv("CLHLS/clhls_1998_2018_result.csv", index=False)
  258. def deal_2008_2018_data():
  259. sav_file_path = "CLHLS/clhls_2008_2018_longitudinal_dataset_released_version1.sav"
  260. csv_file_path = "CLHLS/clhls_2008_2018_longitudinal_dataset_released_version1.csv"
  261. # 将sav数据转为csv
  262. # sav2csv(sav_file_path, csv_file_path)
  263. #处理数据
  264. data = pd.read_csv(csv_file_path)
  265. # 存活状态0存活;1死亡;-9失访;-8死亡/失访
  266. result = data[['id', 'dth08_11', 'dth11_14', 'dth14_18']]
  267. # 人口特征学变量
  268. # 8/9代表无法回答和缺失
  269. # 年龄
  270. result[['trueage_08','trueage_11', 'trueage_14', 'trueage_18']] = data[['trueage','vage_11', 'trueage_14', 'trueage_18']]
  271. # 性别 1男;0女
  272. result['sex'] = data['a1'].apply(lambda x : 1 if x==1 else 0)
  273. # 民族 1汉族;0非汉族
  274. result['ethnic'] = data['a2'].apply(lambda x : 1 if x==1 else 0)
  275. # 出生地 1城市;0农村
  276. result['birth_place'] = data['a43'].apply(lambda x : 1 if x == 1 else (0 if x == 2 else np.nan))
  277. # 教育状况 无11年
  278. result['edu_08'] = data['f1'].apply(lambda x : np.nan if x==88 or x==99 else x)
  279. result['edu_14'] = data['f1_14'].apply(lambda x : np.nan if x==88 or x==99 else x)
  280. result['edu_18'] = data['f1_18'].apply(lambda x : np.nan if x==88 or x==99 else x)
  281. # 婚姻状况 0separated/divorced/widowed/never married; 1currently married and living with spouse
  282. result['marital_08'] = data['f41'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
  283. result['marital_11'] = data['f41_11'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 or x==-9 or x==-8 or x==-7 else 1))
  284. result['marital_14'] = data['f41_14'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
  285. result['marital_18'] = data['f41_18'].apply(lambda x : 0 if x==2 or x==3 or x==4 or x==5 else (np.nan if x==9 else 1))
  286. # 生活是否富裕 1富裕及以上;0一般及以下
  287. result['econ_state_08'] = data['f34'].apply(lambda x : 0 if x==2 or x==3 else (1 if x==1 else np.nan))
  288. result['econ_state_11'] = data['f34_11'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  289. result['econ_state_14'] = data['f34_14'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  290. result['econ_state_18'] = data['f34_18'].apply(lambda x : 0 if x==4 or x==3 or x==5 else (1 if x==1 or x==2 else np.nan))
  291. # 上一年家庭收入 99998超过10万
  292. result['income_08'] = data['f35'].apply(lambda x : x if x== 99998 else np.nan)
  293. result['income_11'] = data['f35_11'].apply(lambda x : x if x== 99998 else np.nan)
  294. result['income_14'] = data['f35_14'].apply(lambda x : x if x== 99998 else np.nan)
  295. result['income_18'] = data['f35_18'].apply(lambda x : x if x== 99998 else np.nan)
  296. # 居住状态 1与家庭成员同住;2独居;3在机构居住
  297. result['co_residence_08'] = data['a51'].apply(lambda x : np.nan if x==9 else x)
  298. result['co_residence_11'] = data['a51_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  299. result['co_residence_14'] = data['a51_14'].apply(lambda x : np.nan if x==9 else x)
  300. result['co_residence_18'] = data['a51_18'].apply(lambda x : np.nan if x==9 else x)
  301. # 目前是否吸烟 1是;2否
  302. result['smoke_08'] = data['d71'].apply(lambda x : np.nan if x==9 else x)
  303. result['smoke_11'] = data['d71_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  304. result['smoke_14'] = data['d71_14'].apply(lambda x : np.nan if x==9 else x)
  305. result['smoke_18'] = data['d71_18'].apply(lambda x : np.nan if x==9 else x)
  306. # 目前是否饮酒 1是;2否
  307. result['drink_08'] = data['d81'].apply(lambda x : np.nan if x==9 else x)
  308. result['drink_11'] = data['d81_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 else x)
  309. result['drink_14'] = data['d81_14'].apply(lambda x : np.nan if x==9 else x)
  310. result['drink_18'] = data['d81_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  311. # 目前是否锻炼
  312. result['exercise_08'] = data['d91'].apply(lambda x : np.nan if x==9 else x)
  313. result['exercise_11'] = data['d91_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
  314. result['exercise_14'] = data['d91_14'].apply(lambda x : np.nan if x==9 else x)
  315. result['exercise_18'] = data['d91_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  316. # 健康状况变量 1very good; 2good; 3so so; 4bad; 5very bad;
  317. result['self_reported_helth_08'] = data['b12'].apply(lambda x : np.nan if x==9 or x==8 else x)
  318. result['self_reported_helth_11'] = data['b12_11'].apply(lambda x : np.nan if x==9 or x==-9 or x == -8 or x == -7 or x == 8 else x)
  319. result['self_reported_helth_14'] = data['b12_14'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  320. result['self_reported_helth_18'] = data['b12_18'].apply(lambda x : np.nan if x==9 or x == 8 else x)
  321. # 慢性病
  322. result['chronic_08'] = data['g14a1'].apply(lambda x : np.nan if x==66 or x==89 or x==99 or x==-9 or x == -6 or x == -1 else x)
  323. result['chronic_11'] = data['g14a1_11'].apply(lambda x : np.nan if x==66 or x==88 or x==99 or x==-9 or x == -8 or x == -7 or x==-1 else x)
  324. result['chronic_14'] = data['g14a1_14'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
  325. result['chronic_18'] = data['g14a1_18'].apply(lambda x : np.nan if x==66 or x==99 or x==88 or x == -1 else x)
  326. # 抑郁量表得分-only 18年 0无抑郁症;1有抑郁症
  327. # 简版流调中心抑郁量表(CESD-10)10个CESD项目,每个项目的分值范围为0到3分,将每个CESD项目的分值相加,得到总得分
  328. # 定义转换规则
  329. transformation_one = {1: 3, 2: 2, 3: 2, 4: 1, 5: 0}
  330. # 应用转换规则
  331. columns_cesd_one = ['b31_18', 'b32_18', 'b33_18', 'b34_18', 'b36_18', 'b38_18', 'b39_18']
  332. for column_one in columns_cesd_one:
  333. data[column_one] = data[column_one].map(transformation_one).fillna(np.nan) # 将8转换为缺失值
  334. # 定义转换规则
  335. transformation_two = {1: 0, 2: 1, 3: 1, 4: 2, 5: 3}
  336. # 应用转换规则
  337. columns_cesd_two = ['b35_18', 'b37_18', 'b310a_18']
  338. for column_two in columns_cesd_two:
  339. data[column_two] = data[column_two].map(transformation_two).fillna(np.nan) # 将8转换为缺失值
  340. result['cesd'] = data['b31_18'] + data['b32_18'] + data['b33_18'] + data['b34_18'] + data['b36_18'] + data['b38_18'] + data['b39_18'] + data['b35_18'] + data['b37_18'] + data['b310a_18']
  341. result['cesd_d'] = result['cesd'].apply(lambda x : 0 if x >= 0 and x <= 15 else (1 if x >=16 and x <= 30 else np.nan))
  342. # 日常生活活动能力 0无残疾;1有残疾
  343. # ADL6个项目bathing, dressing, eating, indoor transferring, toileting, and continence, 每个项目的分值范围是0到2分, 将每个ADL项目的得分相加,得到总得分
  344. columns_adl_08 = ['e1', 'e2', 'e3', 'e4', 'e5', 'e6']
  345. trans_adl(columns_adl_08, data, result, "adl_08")
  346. columns_adl_11 = ['e1_11', 'e2_11', 'e3_11', 'e4_11', 'e5_11', 'e6_11']
  347. trans_adl(columns_adl_11, data, result, "adl_11")
  348. columns_adl_14 = ['e1_14', 'e2_14', 'e3_14', 'e4_14', 'e5_14', 'e6_14']
  349. trans_adl(columns_adl_14, data, result, "adl_14")
  350. columns_adl_18 = ['e1_18', 'e2_18', 'e3_18', 'e4_18', 'e5_18', 'e6_18']
  351. trans_adl(columns_adl_18, data, result, "adl_18")
  352. # 认知功能 0有认知功能障碍;1认知功能正常
  353. # 简易精神状态评价量表(Mini-mental State Examination, MMSE),该量表包括一般能力(12分),反应能力(3分),注意力与计算力(6分),回忆力(3分),语言理解
  354. # 与自我协调能力(6分)5个部分24个问题,总分30分,分数越高,表示认知功能水平越高
  355. columns_mmse = ["c11", "c12", "c13", "c14", "c15", "c21a", "c21b", "c21c", "c31a", "c31b", "c31c", "c31d", "c31e", "c32", "c41a", "c41b", "c41c", "c51a", "c51b", "c52", "c53a", "c53b", "c53c",
  356. "c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c21a_11", "c21b_11", "c21c_11", "c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11", "c41a_11", "c41b_11", "c41c_11", "c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11",
  357. "c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c21a_14", "c21b_14", "c21c_14", "c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14", "c41a_14", "c41b_14", "c41c_14", "c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14",
  358. "c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c21a_18", "c21b_18", "c21c_18", "c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18", "c41a_18", "c41b_18", "c41c_18", "c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
  359. trans_mmse(columns_mmse, data)
  360. columns_mmse_c16 = ["c16", "c16_11", "c16_14", "c16_18"]
  361. trans_mmse_c16(columns_mmse_c16, data)
  362. columns_cognitive_08 = ["c11", "c12", "c13", "c14", "c15", "c16"]
  363. columns_reaction_08 = ["c21a", "c21b", "c21c"]
  364. columns_attention_08 = ["c31a", "c31b", "c31c", "c31d", "c31e", "c32"]
  365. columns_memory_08 = ["c41a", "c41b", "c41c"]
  366. columns_language_08 = ["c51a", "c51b", "c52", "c53a", "c53b", "c53c"]
  367. get_mmse(columns_cognitive_08,columns_reaction_08,columns_attention_08, columns_memory_08 ,columns_language_08, data, result, "08")
  368. columns_cognitive_11 = ["c11_11", "c12_11", "c13_11", "c14_11", "c15_11", "c16_11"]
  369. columns_reaction_11 = ["c21a_11", "c21b_11", "c21c_11"]
  370. columns_attention_11 = ["c31a_11", "c31b_11", "c31c_11", "c31d_11", "c31e_11", "c32_11"]
  371. columns_memory_11 = ["c41a_11", "c41b_11", "c41c_11"]
  372. columns_language_11 = ["c51a_11", "c51b_11", "c52_11", "c53a_11", "c53b_11", "c53c_11"]
  373. get_mmse(columns_cognitive_11,columns_reaction_11,columns_attention_11, columns_memory_11 ,columns_language_11, data, result, "11")
  374. columns_cognitive_14 = ["c11_14", "c12_14", "c13_14", "c14_14", "c15_14", "c16_14"]
  375. columns_reaction_14 = ["c21a_14", "c21b_14", "c21c_14"]
  376. columns_attention_14 = ["c31a_14", "c31b_14", "c31c_14", "c31d_14", "c31e_14", "c32_14"]
  377. columns_memory_14 = ["c41a_14", "c41b_14", "c41c_14"]
  378. columns_language_14 = ["c51a_14", "c51b_14", "c52_14", "c53a_14", "c53b_14", "c53c_14"]
  379. get_mmse(columns_cognitive_14,columns_reaction_14,columns_attention_14, columns_memory_14 ,columns_language_14, data, result, "14")
  380. columns_cognitive_18 = ["c11_18", "c12_18", "c13_18", "c14_18", "c15_18", "c16_18"]
  381. columns_reaction_18 = ["c21a_18", "c21b_18", "c21c_18"]
  382. columns_attention_18 = ["c31a_18", "c31b_18", "c31c_18", "c31d_18", "c31e_18", "c32_18"]
  383. columns_memory_18 = ["c41a_18", "c41b_18", "c41c_18"]
  384. columns_language_18 = ["c51a_18", "c51b_18", "c52_18", "c53a_18", "c53b_18", "c53c_18"]
  385. get_mmse(columns_cognitive_18,columns_reaction_18,columns_attention_18, columns_memory_18 ,columns_language_18, data, result, "18")
  386. # #血液指标
  387. #2008
  388. columns_bio = ["id"]
  389. biomarker = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2008.tab", sep='\t')
  390. biomarker = biomarker.loc[:,["id", "plt", "lymph", "hdl"]]
  391. for col in biomarker.columns:
  392. if not col == "id":
  393. columns_bio.append(col+"_08")
  394. biomarker.columns = columns_bio
  395. result = pd.merge(result, biomarker, on = ["id"], how="left")
  396. #2012
  397. columns_bio = ["id"]
  398. biomarker = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2012.tab", sep='\t')
  399. biomarker = biomarker.loc[:,["id", "plt", "lymph", "hdlc"]]
  400. for col in biomarker.columns:
  401. if not col == "id":
  402. columns_bio.append(col+"_12")
  403. biomarker.columns = columns_bio
  404. result = pd.merge(result, biomarker, on = ["id"], how="left")
  405. #2014
  406. columns_bio = ["id"]
  407. biomarker = pd.read_csv("CLHLS/biomarker_dataset_CLHLS_2014.tab", sep='\t')
  408. biomarker = biomarker.loc[:,["id", "plt", "lymph", "hdlc"]]
  409. for col in biomarker.columns:
  410. if not col == "id":
  411. columns_bio.append(col+"_14")
  412. biomarker.columns = columns_bio
  413. result = pd.merge(result, biomarker, on = ["id"], how="left")
  414. print(result.head())
  415. result.to_csv("CLHLS/clhls_2008_2018_result.csv", index=False)
  416. if __name__ == "__main__":
  417. # deal_1998_2018_data()
  418. deal_2008_2018_data()
  419. print(123)