CHARLS_preprocess_main.py 55 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676
  1. import pandas as pd
  2. import numpy as np
  3. import pyreadstat
  4. #统一列名
  5. def change_columns(df):
  6. df.columns = ["ID",'householdID','communityID','sex', "birth_year", "marital_status" , 'province', 'city',"Height", "Weight",
  7. "Systolic","Diastolic",
  8. 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
  9. 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
  10. 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  11. 'Liver_Disease', 'Heart_Problems', 'Stroke', ' Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  12. 'Emotional_Nervous_or_Psychiatric_Problems', ' Memory_Related_Disease',' Arthritis_or_Rheumatism','Asthma',
  13. 'Vigorous_Activities', 'Moderate_Physical_Effort','Walking','Vigorous_Activities_day', 'Moderate_Physical_Effort_day',
  14. 'Walking_day','Vigorous_Activities_2h', 'Moderate_Physical_Effort_2h','Walking_2h','Vigorous_Activities_30m',
  15. 'Moderate_Physical_Effort_30m','Walking_30m','Vigorous_Activities_4h', 'Moderate_Physical_Effort_4h','Walking_4h',
  16. 'Smoke', 'Smoke_still','Number_Cigarettes','Drink',
  17. "Cognition_score", "Psychiatric_score", "wave"
  18. ]
  19. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  20. def process_row(row):
  21. da002_12_ = row['da003_12_']
  22. da002_13_ = row['da003_13_']
  23. if da002_12_ == 1 or da002_13_ == 1:
  24. return 1
  25. elif da002_12_ == 2 and da002_13_ == 2:
  26. return 2
  27. elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
  28. return 2
  29. elif pd.isna(da002_12_) and pd.isna(da002_13_):
  30. return np.nan
  31. else:
  32. return np.nan # 预防万一,其余情况下设为NA
  33. def update_da051(value):
  34. if value == 1:
  35. return 3
  36. elif value == 3:
  37. return 1
  38. else:
  39. return value
  40. if __name__ == "__main__":
  41. # 2011年
  42. year = "2011"
  43. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
  44. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
  45. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
  46. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
  47. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
  48. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
  49. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  50. #性别#年龄#居住地#婚姻状况
  51. # 1 Married with spouse present
  52. # 2 Married but not living with spouse temporarily for reasons such as work
  53. # 3 Separated
  54. # 4 Divorced
  55. # 5 Widowed
  56. # 6 Never married
  57. data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','be001']]
  58. #居住地
  59. data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  60. #身高#体重#收缩压#舒张压
  61. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002','qa011','qa012']]
  62. data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  63. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  64. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  65. blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
  66. data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
  67. # 慢性病:
  68. # (1) Hypertension 高血压病
  69. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  70. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  71. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  72. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  73. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  74. # (除脂肪肝、肿瘤或癌外)
  75. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  76. # (8) Stroke 中风
  77. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  78. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  79. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  80. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  81. # (13) Arthritis or rheumatism 关节炎或风湿病
  82. # (14) Asthma 哮喘
  83. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  84. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  85. ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
  86. ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
  87. ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
  88. ,'da069']]
  89. data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  90. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  91. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  92. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  93. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  94. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  95. health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  96. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  97. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  98. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  99. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  100. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  101. #词语记忆
  102. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  103. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  104. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  105. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  106. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  107. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  108. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  109. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  110. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  111. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  112. health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  113. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  114. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  115. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  116. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  117. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  118. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  119. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  120. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  121. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  122. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  123. health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  124. #画图
  125. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
  126. data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  127. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  128. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  129. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  130. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  131. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  132. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  133. health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  134. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  135. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  136. health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
  137. health_status["draw_score"]
  138. #心理得分
  139. health_status["dc009_score"] = health_status["dc009"]-1
  140. health_status["dc010_score"] = health_status["dc010"]-1
  141. health_status["dc011_score"] = health_status["dc011"]-1
  142. health_status["dc012_score"] = health_status["dc012"]-1
  143. health_status["dc013_score"] = 4 - health_status["dc013"]
  144. health_status["dc014_score"] = health_status["dc014"]-1
  145. health_status["dc015_score"] = health_status["dc015"]-1
  146. health_status["dc016_score"] = 4 - health_status["dc016"]
  147. health_status["dc017_score"] = health_status["dc017"]-1
  148. health_status["dc018_score"] = health_status["dc018"]-1
  149. data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  150. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  151. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  152. data_2011["wave"] = year
  153. change_columns(data_2011)
  154. # 2013年
  155. year = "2013"
  156. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  157. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
  158. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  159. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  160. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  161. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  162. #性别#年龄#婚姻状况
  163. # 1 Married with spouse present
  164. # 2 Married but not living with spouse temporarily for reasons such as work
  165. # 3 Separated
  166. # 4 Divorced
  167. # 5 Widowed
  168. # 6 Never married
  169. data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','zba002_1','be001']]
  170. #居住地
  171. data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  172. #身高#体重#收缩压#舒张压
  173. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002','qa011','qa012']]
  174. data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  175. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  176. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  177. data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  178. # 慢性病:
  179. # (1) Hypertension 高血压病
  180. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  181. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  182. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  183. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  184. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  185. # (除脂肪肝、肿瘤或癌外)
  186. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  187. # (8) Stroke 中风
  188. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  189. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  190. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  191. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  192. # (13) Arthritis or rheumatism 关节炎或风湿病
  193. # (14) Asthma 哮喘
  194. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  195. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  196. ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
  197. ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
  198. ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
  199. ,'da069']]
  200. data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  201. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  202. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  203. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  204. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  205. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  206. health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  207. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  208. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  209. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  210. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  211. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  212. #词语记忆
  213. health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  214. health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  215. health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  216. health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  217. health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  218. health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  219. health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  220. health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  221. health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  222. health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  223. health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  224. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  225. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  226. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  227. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  228. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  229. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  230. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  231. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  232. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  233. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  234. health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  235. #画图
  236. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
  237. data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  238. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  239. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  240. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  241. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  242. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  243. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  244. health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  245. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  246. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  247. health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
  248. health_status["draw_score"]
  249. #心理得分
  250. health_status["dc009_score"] = health_status["dc009"]-1
  251. health_status["dc010_score"] = health_status["dc010"]-1
  252. health_status["dc011_score"] = health_status["dc011"]-1
  253. health_status["dc012_score"] = health_status["dc012"]-1
  254. health_status["dc013_score"] = 4 - health_status["dc013"]
  255. health_status["dc014_score"] = health_status["dc014"]-1
  256. health_status["dc015_score"] = health_status["dc015"]-1
  257. health_status["dc016_score"] = 4 - health_status["dc016"]
  258. health_status["dc017_score"] = health_status["dc017"]-1
  259. health_status["dc018_score"] = health_status["dc018"]-1
  260. data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  261. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  262. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  263. data_2013["wave"] = year
  264. change_columns(data_2013)
  265. data_2013 = pd.concat([data_2011, data_2013], axis=0)
  266. # 2015年
  267. year = "2015"
  268. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  269. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  270. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
  271. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  272. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  273. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  274. #性别#年龄#婚姻状况
  275. # 1 Married with spouse present
  276. # 2 Married but not living with spouse temporarily for reasons such as work
  277. # 3 Separated
  278. # 4 Divorced
  279. # 5 Widowed
  280. # 6 Never married
  281. data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001']]
  282. # 处理出生年的问题
  283. data_2015['ba004_w3_1'] = demo.apply(lambda row: row['ba002_1'] if row['ba002'] == 2 else row['ba004_w3_1'], axis=1)
  284. #居住地
  285. data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  286. #身高#体重#收缩压#舒张压
  287. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'qa011','qa012']]
  288. data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  289. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  290. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  291. blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
  292. data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
  293. # 慢性病:
  294. # (1) Hypertension 高血压病
  295. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  296. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  297. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  298. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  299. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  300. # (除脂肪肝、肿瘤或癌外)
  301. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  302. # (8) Stroke 中风
  303. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  304. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  305. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  306. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  307. # (13) Arthritis or rheumatism 关节炎或风湿病
  308. # (14) Asthma 哮喘
  309. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  310. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  311. ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
  312. ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
  313. ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
  314. ,'da069']]
  315. data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  316. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  317. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  318. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  319. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  320. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  321. health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  322. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  323. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  324. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  325. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  326. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  327. #词语记忆
  328. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  329. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  330. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  331. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  332. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  333. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  334. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  335. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  336. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  337. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  338. health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  339. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  340. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  341. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  342. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  343. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  344. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  345. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  346. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  347. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  348. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  349. health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  350. #画图
  351. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0)
  352. data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  353. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  354. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  355. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  356. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  357. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  358. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  359. health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  360. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  361. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  362. health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
  363. health_status["draw_score"]
  364. #心理得分
  365. health_status["dc009_score"] = health_status["dc009"]-1
  366. health_status["dc010_score"] = health_status["dc010"]-1
  367. health_status["dc011_score"] = health_status["dc011"]-1
  368. health_status["dc012_score"] = health_status["dc012"]-1
  369. health_status["dc013_score"] = 4 - health_status["dc013"]
  370. health_status["dc014_score"] = health_status["dc014"]-1
  371. health_status["dc015_score"] = health_status["dc015"]-1
  372. health_status["dc016_score"] = 4 - health_status["dc016"]
  373. health_status["dc017_score"] = health_status["dc017"]-1
  374. health_status["dc018_score"] = health_status["dc018"]-1
  375. data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  376. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  377. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  378. data_2015["wave"] = year
  379. change_columns(data_2015)
  380. data_2015 = pd.concat([data_2013, data_2015], axis=0)
  381. # 2018年
  382. year = "2018"
  383. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  384. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  385. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  386. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  387. cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
  388. #性别#年龄#婚姻状况
  389. # 1 Married with spouse present
  390. # 2 Married but not living with spouse temporarily for reasons such as work
  391. # 3 Separated
  392. # 4 Divorced
  393. # 5 Widowed
  394. # 6 Never married
  395. data_2018 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'ba004_w3_1', 'be001']]
  396. #居住地
  397. data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  398. #身高#体重#收缩压#舒张压
  399. data_2018[['qi002', 'ql002', 'qa011','qa012']]=np.nan
  400. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  401. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  402. data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  403. # 慢性病:
  404. # (1) Hypertension 高血压病
  405. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  406. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  407. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  408. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  409. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  410. # (除脂肪肝、肿瘤或癌外)
  411. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  412. # (8) Stroke 中风
  413. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  414. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  415. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  416. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  417. # (13) Arthritis or rheumatism 关节炎或风湿病
  418. # (14) Asthma 哮喘
  419. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  420. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  421. ,'da007_12_','da007_13_','da007_14_','da051_1_','da051_2_', 'da051_3_'
  422. ,'da052_1_','da052_2_','da052_3_','da053_1_','da053_2_','da053_3_','da054_1_','da054_2_','da054_3_'
  423. ,'da055_1_','da055_2_','da055_3_', 'da059','da061','da063'
  424. ,'da069']]
  425. data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  426. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  427. cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  428. cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  429. cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  430. cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  431. cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  432. cognition["dc019_score"] = cognition["dc014_w4_1_1"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  433. cognition["dc020_score"] = cognition["dc014_w4_2_1"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  434. cognition["dc021_score"] = cognition["dc014_w4_3_1"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  435. cognition["dc022_score"] = cognition["dc014_w4_4_1"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  436. cognition["dc023_score"] = cognition["dc014_w4_5_1"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  437. #词语记忆
  438. cognition["dc006s1_score"] = cognition["dc028_w4_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  439. cognition["dc006s2_score"] = cognition["dc028_w4_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  440. cognition["dc006s3_score"] = cognition["dc028_w4_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  441. cognition["dc006s4_score"] = cognition["dc028_w4_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  442. cognition["dc006s5_score"] = cognition["dc028_w4_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  443. cognition["dc006s6_score"] = cognition["dc028_w4_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  444. cognition["dc006s7_score"] = cognition["dc028_w4_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  445. cognition["dc006s8_score"] = cognition["dc028_w4_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  446. cognition["dc006s9_score"] = cognition["dc028_w4_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  447. cognition["dc006s10_score"] = cognition["dc028_w4_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  448. cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  449. cognition["dc027s1_score"] = cognition["dc047_w4_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  450. cognition["dc027s2_score"] = cognition["dc047_w4_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  451. cognition["dc027s3_score"] = cognition["dc047_w4_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  452. cognition["dc027s4_score"] = cognition["dc047_w4_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  453. cognition["dc027s5_score"] = cognition["dc047_w4_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  454. cognition["dc027s6_score"] = cognition["dc047_w4_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  455. cognition["dc027s7_score"] = cognition["dc047_w4_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  456. cognition["dc027s8_score"] = cognition["dc047_w4_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  457. cognition["dc027s9_score"] = cognition["dc047_w4_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  458. cognition["dc027s10_score"] = cognition["dc047_w4_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  459. cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  460. #画图
  461. cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0)
  462. data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
  463. cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
  464. cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
  465. cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
  466. cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
  467. cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
  468. cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
  469. cognition["dc006s11_score"] + cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
  470. cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
  471. cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
  472. cognition["dc027s9_score"]+cognition["dc027s10_score"]+cognition["dc027s11_score"]+\
  473. cognition["draw_score"]
  474. #心理得分
  475. cognition["dc009_score"] = cognition["dc009"]-1
  476. cognition["dc010_score"] = cognition["dc010"]-1
  477. cognition["dc011_score"] = cognition["dc011"]-1
  478. cognition["dc012_score"] = cognition["dc012"]-1
  479. cognition["dc013_score"] = 4 - cognition["dc013"]
  480. cognition["dc014_score"] = cognition["dc014"]-1
  481. cognition["dc015_score"] = cognition["dc015"]-1
  482. cognition["dc016_score"] = 4 - cognition["dc016"]
  483. cognition["dc017_score"] = cognition["dc017"]-1
  484. cognition["dc018_score"] = cognition["dc018"]-1
  485. data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
  486. cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
  487. cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
  488. data_2018["wave"] = year
  489. change_columns(data_2018)
  490. data_2018 = pd.concat([data_2015, data_2018], axis=0)
  491. # 2020年
  492. year = "2020"
  493. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  494. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  495. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  496. #性别#年龄#婚姻状况
  497. # 1 Married with spouse present
  498. # 2 Married but not living with spouse temporarily for reasons such as work
  499. # 3 Separated
  500. # 4 Divorced
  501. # 5 Widowed
  502. # 6 Never married
  503. data_2020 = demo[['ID','householdID', 'communityID','ba001', 'ba003_1','ba011']]
  504. #居住地
  505. data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  506. #身高#体重#收缩压#舒张压
  507. data_2020[['qi002', 'ql002', 'qa011','qa012', 'qa013']]=np.nan
  508. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  509. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  510. data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  511. # 慢性病:
  512. # (1) Hypertension 高血压病
  513. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  514. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  515. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  516. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  517. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  518. # (除脂肪肝、肿瘤或癌外)
  519. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  520. # (8) Stroke 中风
  521. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  522. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  523. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  524. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  525. # (13) Arthritis or rheumatism 关节炎或风湿病
  526. # (14) Asthma 哮喘
  527. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  528. health_status['da003_12_'] = health_status.apply(process_row, axis=1)
  529. health_status_select = health_status[['ID','householdID', 'communityID', 'da003_1_', 'da003_2_','da003_3_'
  530. ,'da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
  531. ,'da003_12_','da003_14_','da003_15_','da032_1_','da032_2_', 'da032_3_'
  532. ,'da033_1_','da033_2_','da033_3_','da034_1_','da034_2_','da034_3_','da035_1_','da035_2_','da035_3_'
  533. ,'da036_1_','da036_2_','da036_3_', 'da046','da047','da050_1'
  534. ,'da051']]
  535. health_status_select['da051'] = health_status_select['da051'].apply(update_da051)
  536. data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  537. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  538. health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  539. health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  540. health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  541. health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  542. health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  543. health_status["dc019_score"] = health_status["dc007_1"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  544. health_status["dc020_score"] = health_status["dc007_2"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  545. health_status["dc021_score"] = health_status["dc007_3"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  546. health_status["dc022_score"] = health_status["dc007_4"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  547. health_status["dc023_score"] = health_status["dc007_5"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  548. #词语记忆
  549. health_status["dc006s1_score"] = health_status["dc012_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  550. health_status["dc006s2_score"] = health_status["dc012_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  551. health_status["dc006s3_score"] = health_status["dc012_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  552. health_status["dc006s4_score"] = health_status["dc012_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  553. health_status["dc006s5_score"] = health_status["dc012_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  554. health_status["dc006s6_score"] = health_status["dc012_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  555. health_status["dc006s7_score"] = health_status["dc012_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  556. health_status["dc006s8_score"] = health_status["dc012_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  557. health_status["dc006s9_score"] = health_status["dc012_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  558. health_status["dc006s10_score"] = health_status["dc012_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  559. health_status["dc006s11_score"] = health_status["dc012_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  560. health_status["dc027s1_score"] = health_status["dc028_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  561. health_status["dc027s2_score"] = health_status["dc028_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  562. health_status["dc027s3_score"] = health_status["dc028_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  563. health_status["dc027s4_score"] = health_status["dc028_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  564. health_status["dc027s5_score"] = health_status["dc028_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  565. health_status["dc027s6_score"] = health_status["dc028_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  566. health_status["dc027s7_score"] = health_status["dc028_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  567. health_status["dc027s8_score"] = health_status["dc028_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  568. health_status["dc027s9_score"] = health_status["dc028_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  569. health_status["dc027s10_score"] = health_status["dc028_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  570. health_status["dc027s11_score"] = health_status["dc028_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  571. #画图
  572. health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0)
  573. data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  574. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  575. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  576. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  577. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  578. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  579. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  580. health_status["dc006s11_score"] + health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  581. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  582. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  583. health_status["dc027s9_score"]+health_status["dc027s10_score"]+health_status["dc027s11_score"]+\
  584. health_status["draw_score"]
  585. #心理得分
  586. health_status["dc009_score"] = health_status["dc016"]-1
  587. health_status["dc010_score"] = health_status["dc017"]-1
  588. health_status["dc011_score"] = health_status["dc018"]-1
  589. health_status["dc012_score"] = health_status["dc019"]-1
  590. health_status["dc013_score"] = 4 - health_status["dc020"]
  591. health_status["dc014_score"] = health_status["dc021"]-1
  592. health_status["dc015_score"] = health_status["dc022"]-1
  593. health_status["dc016_score"] = 4 - health_status["dc023"]
  594. health_status["dc017_score"] = health_status["dc024"]-1
  595. health_status["dc018_score"] = health_status["dc025"]-1
  596. data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  597. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  598. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  599. data_2020["wave"] = year
  600. change_columns(data_2020)
  601. data_2020 = pd.concat([data_2018, data_2020], axis=0)
  602. #修改地区名称
  603. #省份、城市名称和污染物数据格式对齐
  604. #海东地区->海东市
  605. data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
  606. #北京 -> 北京市
  607. data_2020['city'] = data_2020['city'].replace('北京', '北京市')
  608. data_2020['province'] = data_2020['province'].replace('北京', '北京市')
  609. #哈尔滨 -> 哈尔滨市
  610. data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
  611. #天津 -> 天津市
  612. data_2020['city'] = data_2020['city'].replace('天津', '天津市')
  613. data_2020['province'] = data_2020['province'].replace('天津', '天津市')
  614. #广西省 -> 广西壮族自治区
  615. data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
  616. #巢湖市 -> 合肥市
  617. data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
  618. #襄樊市->襄阳市
  619. data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
  620. data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
  621. print(123)