CHARLS_preprocess_main.py 79 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002
  1. import pandas as pd
  2. import numpy as np
  3. import pyreadstat
  4. #统一列名
  5. def change_columns(df):
  6. df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "marital_status" , "education", 'province', 'city',"Height", "Weight",
  7. "waist", "Systolic","Diastolic",
  8. 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
  9. 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
  10. 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  11. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  12. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
  13. 'Physical_activity',
  14. 'Smoke','Drink',
  15. "Cognition_score", "Psychiatric_score","sleep_state", "wave"
  16. ]
  17. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  18. def process_row(row):
  19. da002_12_ = row['da003_12_']
  20. da002_13_ = row['da003_13_']
  21. if da002_12_ == 1 or da002_13_ == 1:
  22. return 1
  23. elif da002_12_ == 2 and da002_13_ == 2:
  24. return 2
  25. elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
  26. return 2
  27. elif pd.isna(da002_12_) and pd.isna(da002_13_):
  28. return np.nan
  29. else:
  30. return np.nan # 预防万一,其余情况下设为NA
  31. def update_da051(value):
  32. if value == 1:
  33. return 3
  34. elif value == 3:
  35. return 1
  36. else:
  37. return value
  38. if __name__ == "__main__":
  39. # 2011年
  40. year = "2011"
  41. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
  42. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
  43. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
  44. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
  45. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
  46. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
  47. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  48. #性别#年龄#居住地#婚姻状况
  49. # 1 married or partnered
  50. # 0 other marital status (separated, divorced, unmarried, or widowed)
  51. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  52. #教育
  53. # 0 below high school
  54. # 1 high school
  55. # 2 college or above
  56. demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  57. data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','marital_status', 'education']]
  58. #居住地
  59. data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  60. #身高#体重#收缩压#舒张压
  61. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  62. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  63. #腰围
  64. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  65. #血压测量后两次的平均
  66. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  67. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  68. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  69. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  70. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  71. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  72. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic']]
  73. data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  74. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  75. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  76. blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
  77. data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
  78. # 慢性病:
  79. # (1) Hypertension 高血压病
  80. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  81. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  82. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  83. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  84. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  85. # (除脂肪肝、肿瘤或癌外)
  86. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  87. # (8) Stroke 中风
  88. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  89. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  90. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  91. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  92. # (13) Arthritis or rheumatism 关节炎或风湿病
  93. # (14) Asthma 哮喘
  94. # 体力活动
  95. # 2 vigorous (vigorous activity more than once a week)
  96. # 1 moderate (moderate activity more than once a week)
  97. # 0 inactive (the rest)
  98. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  99. 1 if x["da051_2_"]==1 else
  100. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  101. else np.nan ,axis=1)
  102. # 抽烟
  103. # 1 抽过烟
  104. # 0 没有抽过烟
  105. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  106. # 喝酒
  107. # 1 喝过酒
  108. # 0 没有喝过酒
  109. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  110. 0 if x["da069"] == 1 else
  111. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  112. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  113. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  114. ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"]]
  115. data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  116. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆20分、画图1分
  117. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  118. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  119. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  120. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  121. health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  122. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  123. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  124. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  125. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  126. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  127. #词语记忆
  128. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  129. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  130. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  131. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  132. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  133. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  134. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  135. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  136. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  137. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  138. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  139. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  140. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  141. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  142. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  143. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  144. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  145. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  146. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  147. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  148. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  149. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  150. #画图
  151. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  152. data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  153. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  154. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  155. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  156. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  157. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  158. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  159. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  160. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  161. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  162. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  163. health_status["draw_score"]
  164. #心理得分
  165. health_status["dc009_score"] = health_status["dc009"]-1
  166. health_status["dc010_score"] = health_status["dc010"]-1
  167. health_status["dc011_score"] = health_status["dc011"]-1
  168. health_status["dc012_score"] = health_status["dc012"]-1
  169. health_status["dc013_score"] = 4 - health_status["dc013"]
  170. health_status["dc014_score"] = health_status["dc014"]-1
  171. health_status["dc015_score"] = health_status["dc015"]-1
  172. health_status["dc016_score"] = 4 - health_status["dc016"]
  173. health_status["dc017_score"] = health_status["dc017"]-1
  174. health_status["dc018_score"] = health_status["dc018"]-1
  175. data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  176. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  177. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  178. #睡眠状态
  179. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  180. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  181. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  182. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  183. data_2011["sleep_state"] = health_status['dc015']
  184. data_2011["wave"] = year
  185. change_columns(data_2011)
  186. # 2011年的ID和其他年份有一点区别,倒数第三位加0
  187. data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
  188. # 2013年
  189. year = "2013"
  190. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  191. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
  192. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  193. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  194. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  195. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  196. #性别#年龄#婚姻状况
  197. # 1 married or partnered
  198. # 0 other marital status (separated, divorced, unmarried, or widowed)
  199. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  200. #教育
  201. # 0 below high school
  202. # 1 high school
  203. # 2 college or above
  204. # 纠正2011年统计错误的教育
  205. demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
  206. demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  207. education_correct = demo[['ID',"education_correct"]]
  208. # 按 'ID' 列合并两个表
  209. data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
  210. # 使用 fillna() 来更新字段
  211. data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
  212. # 删除多余的列
  213. data_2011 = data_2011.drop(columns=['education_correct'])
  214. #更新2013的教育
  215. demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  216. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  217. #合并2011年的教育
  218. eductaion_2011 = data_2011[['ID',"education"]]
  219. # 按 'ID' 列合并两个表
  220. demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  221. # 使用 fillna() 来更新字段
  222. demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
  223. # 纠正2011年统计错误的出生年
  224. demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
  225. birth_year_2013 = demo[['ID',"birth_year"]]
  226. # 按 'ID' 列合并两个表
  227. data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
  228. # 使用 fillna() 来更新字段
  229. data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
  230. # 删除多余的列
  231. data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011'])
  232. #合并2011年的出生年
  233. birth_year_2011 = data_2011[['ID',"birth_year"]]
  234. # 按 'ID' 列合并两个表
  235. demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  236. # 使用 fillna() 来更新字段
  237. demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
  238. data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','marital_status', "education"]]
  239. #居住地
  240. data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  241. #身高#体重#收缩压#舒张压
  242. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  243. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  244. #腰围
  245. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  246. #血压测量后两次的平均
  247. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  248. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  249. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  250. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  251. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  252. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  253. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic']]
  254. data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  255. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  256. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  257. data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  258. # 慢性病:
  259. # (1) Hypertension 高血压病
  260. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  261. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  262. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  263. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  264. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  265. # (除脂肪肝、肿瘤或癌外)
  266. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  267. # (8) Stroke 中风
  268. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  269. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  270. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  271. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  272. # (13) Arthritis or rheumatism 关节炎或风湿病
  273. # (14) Asthma 哮喘
  274. # 体力活动
  275. # 2 vigorous (vigorous activity more than once a week)
  276. # 1 moderate (moderate activity more than once a week)
  277. # 0 inactive (the rest)
  278. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  279. 1 if x["da051_2_"]==1 else
  280. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  281. else np.nan ,axis=1)
  282. # 抽烟
  283. # 1 抽过烟
  284. # 0 没有抽过烟
  285. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  286. # 喝酒
  287. # 1 喝过酒
  288. # 0 没有喝过酒
  289. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  290. 0 if x["da069"] == 1 else
  291. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  292. # 合并2011年的慢性病
  293. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  294. ,'da007_12_','da007_13_','da007_14_']
  295. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  296. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  297. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  298. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  299. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  300. diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  301. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  302. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  303. # 按 'ID' 列合并两个表
  304. health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  305. # 使用 fillna() 来更新字段
  306. for col in columns_to_diseases_new:
  307. health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
  308. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  309. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  310. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  311. data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  312. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  313. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  314. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  315. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  316. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  317. health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  318. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  319. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  320. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  321. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  322. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  323. #词语记忆
  324. health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  325. health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  326. health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  327. health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  328. health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  329. health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  330. health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  331. health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  332. health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  333. health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  334. # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  335. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  336. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  337. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  338. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  339. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  340. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  341. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  342. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  343. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  344. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  345. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  346. #画图
  347. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  348. data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  349. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  350. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  351. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  352. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  353. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  354. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  355. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  356. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  357. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  358. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  359. health_status["draw_score"]
  360. #心理得分
  361. health_status["dc009_score"] = health_status["dc009"]-1
  362. health_status["dc010_score"] = health_status["dc010"]-1
  363. health_status["dc011_score"] = health_status["dc011"]-1
  364. health_status["dc012_score"] = health_status["dc012"]-1
  365. health_status["dc013_score"] = 4 - health_status["dc013"]
  366. health_status["dc014_score"] = health_status["dc014"]-1
  367. health_status["dc015_score"] = health_status["dc015"]-1
  368. health_status["dc016_score"] = 4 - health_status["dc016"]
  369. health_status["dc017_score"] = health_status["dc017"]-1
  370. health_status["dc018_score"] = health_status["dc018"]-1
  371. data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  372. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  373. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  374. #睡眠状态
  375. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  376. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  377. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  378. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  379. data_2013["sleep_state"] = health_status['dc015']
  380. data_2013["wave"] = year
  381. change_columns(data_2013)
  382. data_2013 = pd.concat([data_2011, data_2013], axis=0)
  383. # 2015年
  384. year = "2015"
  385. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  386. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  387. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
  388. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  389. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  390. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  391. #性别#年龄#婚姻状况
  392. # 1 married or partnered
  393. # 0 other marital status (separated, divorced, unmarried, or widowed)
  394. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  395. #教育
  396. # 0 below high school
  397. # 1 high school
  398. # 2 college or above
  399. #更新2015的教育
  400. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  401. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  402. #合并2013年的教育
  403. eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
  404. # 按 'ID' 列合并两个表
  405. demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  406. # 使用 fillna() 来更新字段
  407. demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
  408. # 2015年的出生年
  409. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
  410. data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year', 'marital_status', 'education']]
  411. #居住地
  412. data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  413. #身高#体重#收缩压#舒张压
  414. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  415. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  416. #腰围
  417. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  418. #血压测量后两次的平均
  419. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  420. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  421. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  422. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  423. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  424. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  425. #身高#体重#收缩压#舒张压
  426. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic']]
  427. data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  428. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  429. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  430. blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
  431. data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
  432. # 慢性病:
  433. # (1) Hypertension 高血压病
  434. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  435. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  436. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  437. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  438. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  439. # (除脂肪肝、肿瘤或癌外)
  440. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  441. # (8) Stroke 中风
  442. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  443. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  444. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  445. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  446. # (13) Arthritis or rheumatism 关节炎或风湿病
  447. # (14) Asthma 哮喘
  448. # 体力活动
  449. # 2 vigorous (vigorous activity more than once a week)
  450. # 1 moderate (moderate activity more than once a week)
  451. # 0 inactive (the rest)
  452. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  453. 1 if x["da051_2_"]==1 else
  454. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  455. else np.nan ,axis=1)
  456. # 抽烟
  457. # 1 抽过烟
  458. # 0 没有抽过烟
  459. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  460. # 喝酒
  461. # 1 喝过酒
  462. # 0 没有喝过酒
  463. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  464. 0 if x["da069"] == 1 else
  465. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  466. # 合并2013年的慢性病
  467. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  468. ,'da007_12_','da007_13_','da007_14_']
  469. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  470. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  471. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  472. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  473. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  474. diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  475. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  476. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  477. # 按 'ID' 列合并两个表
  478. health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  479. # 使用 fillna() 来更新字段
  480. for col in columns_to_diseases_new:
  481. health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
  482. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  483. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  484. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  485. data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  486. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  487. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  488. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  489. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  490. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  491. health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  492. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  493. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  494. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  495. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  496. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  497. #词语记忆
  498. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  499. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  500. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  501. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  502. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  503. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  504. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  505. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  506. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  507. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  508. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  509. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  510. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  511. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  512. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  513. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  514. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  515. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  516. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  517. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  518. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  519. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  520. #画图
  521. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  522. data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  523. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  524. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  525. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  526. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  527. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  528. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  529. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  530. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  531. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  532. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  533. health_status["draw_score"]
  534. #心理得分
  535. health_status["dc009_score"] = health_status["dc009"]-1
  536. health_status["dc010_score"] = health_status["dc010"]-1
  537. health_status["dc011_score"] = health_status["dc011"]-1
  538. health_status["dc012_score"] = health_status["dc012"]-1
  539. health_status["dc013_score"] = 4 - health_status["dc013"]
  540. health_status["dc014_score"] = health_status["dc014"]-1
  541. health_status["dc015_score"] = health_status["dc015"]-1
  542. health_status["dc016_score"] = 4 - health_status["dc016"]
  543. health_status["dc017_score"] = health_status["dc017"]-1
  544. health_status["dc018_score"] = health_status["dc018"]-1
  545. data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  546. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  547. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  548. #睡眠状态
  549. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  550. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  551. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  552. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  553. data_2015["sleep_state"] = health_status['dc015']
  554. data_2015["wave"] = year
  555. change_columns(data_2015)
  556. data_2015 = pd.concat([data_2013, data_2015], axis=0)
  557. # 2018年
  558. year = "2018"
  559. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  560. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  561. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  562. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  563. cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
  564. #性别#年龄#婚姻状况
  565. # 1 married or partnered
  566. # 0 other marital status (separated, divorced, unmarried, or widowed)
  567. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  568. #教育
  569. # 0 below high school
  570. # 1 high school
  571. # 2 college or above
  572. #更新2015的教育
  573. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
  574. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  575. # 出生年
  576. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
  577. data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
  578. #居住地
  579. data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  580. #身高#体重#收缩压#舒张压
  581. data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
  582. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  583. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  584. data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  585. # 慢性病:
  586. # (1) Hypertension 高血压病
  587. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  588. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  589. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  590. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  591. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  592. # (除脂肪肝、肿瘤或癌外)
  593. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  594. # (8) Stroke 中风
  595. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  596. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  597. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  598. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  599. # (13) Arthritis or rheumatism 关节炎或风湿病
  600. # (14) Asthma 哮喘
  601. # 体力活动
  602. # 2 vigorous (vigorous activity more than once a week)
  603. # 1 moderate (moderate activity more than once a week)
  604. # 0 inactive (the rest)
  605. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  606. 1 if x["da051_2_"]==1 else
  607. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  608. else np.nan ,axis=1)
  609. # 抽烟
  610. # 1 抽过烟
  611. # 0 没有抽过烟
  612. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  613. # 喝酒
  614. # 1 喝过酒
  615. # 0 没有喝过酒
  616. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  617. 0 if x["da069"] == 1 else
  618. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  619. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  620. ,'da007_12_','da007_13_','da007_14_']
  621. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  622. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  623. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  624. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  625. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  626. diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  627. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  628. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  629. # 按 'ID' 列合并两个表
  630. health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
  631. # 使用 fillna() 来更新字段
  632. for col in columns_to_diseases_new:
  633. health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
  634. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  635. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  636. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  637. data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  638. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  639. cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  640. cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  641. cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  642. cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  643. cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  644. cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1)
  645. cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1)
  646. cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
  647. cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
  648. cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
  649. #词语记忆
  650. cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
  651. cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
  652. cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
  653. cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1)
  654. cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1)
  655. cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)
  656. cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1)
  657. cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1)
  658. cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)
  659. cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)
  660. # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  661. cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1)
  662. cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1)
  663. cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1)
  664. cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1)
  665. cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1)
  666. cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)
  667. cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1)
  668. cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1)
  669. cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)
  670. cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)
  671. # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  672. #画图
  673. cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  674. data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
  675. cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
  676. cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
  677. cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
  678. cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
  679. cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
  680. cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
  681. cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
  682. cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
  683. cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
  684. cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
  685. cognition["draw_score"]
  686. #心理得分
  687. cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  688. cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  689. cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  690. cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  691. cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  692. cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  693. cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  694. cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  695. cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  696. cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  697. data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
  698. cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
  699. cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
  700. #睡眠状态
  701. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  702. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  703. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  704. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  705. data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x)
  706. data_2018["wave"] = year
  707. change_columns(data_2018)
  708. data_2018 = pd.concat([data_2015, data_2018], axis=0)
  709. # 2020年
  710. year = "2020"
  711. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  712. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  713. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  714. #性别#年龄#婚姻状况
  715. # 1 married or partnered
  716. # 0 other marital status (separated, divorced, unmarried, or widowed)
  717. demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
  718. #教育
  719. # 0 below high school
  720. # 1 high school
  721. # 2 college or above
  722. demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
  723. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  724. #合并2018年的教育
  725. eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
  726. # 按 'ID' 列合并两个表
  727. demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  728. # 使用 fillna() 来更新字段
  729. demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
  730. # 出生年
  731. demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
  732. #合并2018年的出生年
  733. birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year"]]
  734. # 按 'ID' 列合并两个表
  735. demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  736. # 使用 fillna() 来更新字段
  737. demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
  738. data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
  739. #居住地
  740. data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  741. #身高#体重#收缩压#舒张压
  742. data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
  743. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  744. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  745. data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  746. # 慢性病:
  747. # (1) Hypertension 高血压病
  748. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  749. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  750. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  751. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  752. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  753. # (除脂肪肝、肿瘤或癌外)
  754. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  755. # (8) Stroke 中风
  756. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  757. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  758. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  759. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  760. # (13) Arthritis or rheumatism 关节炎或风湿病
  761. # (14) Asthma 哮喘
  762. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  763. # 体力活动
  764. # 2 vigorous (vigorous activity more than once a week)
  765. # 1 moderate (moderate activity more than once a week)
  766. # 0 inactive (the rest)
  767. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else
  768. 1 if x["da032_2_"]==1 else
  769. 0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2)
  770. else np.nan ,axis=1)
  771. # 抽烟
  772. # 1 抽过烟
  773. # 0 没有抽过烟
  774. health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  775. # 喝酒
  776. # 1 喝过酒
  777. # 0 没有喝过酒
  778. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else
  779. 0 if x["da051"] == 3 else np.nan, axis=1)
  780. health_status['da003_12_'] = health_status.apply(process_row, axis=1)
  781. columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
  782. ,'da003_12_','da003_14_','da003_15_']
  783. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  784. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  785. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  786. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  787. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  788. diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  789. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  790. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  791. # 按 'ID' 列合并两个表
  792. health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  793. # 使用 fillna() 来更新字段
  794. for col in columns_to_diseases_new:
  795. health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
  796. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  797. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  798. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  799. data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  800. #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  801. health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  802. health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  803. health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  804. health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  805. health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  806. health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1)
  807. health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1)
  808. health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
  809. health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
  810. health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
  811. #词语记忆
  812. health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
  813. health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
  814. health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
  815. health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1)
  816. health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1)
  817. health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)
  818. health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1)
  819. health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1)
  820. health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)
  821. health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)
  822. health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1)
  823. health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1)
  824. health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1)
  825. health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1)
  826. health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1)
  827. health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)
  828. health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1)
  829. health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1)
  830. health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)
  831. health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)
  832. #画图
  833. health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  834. data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  835. health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
  836. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  837. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  838. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  839. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  840. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  841. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  842. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  843. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  844. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  845. health_status["draw_score"]
  846. #心理得分
  847. health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  848. health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  849. health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  850. health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  851. health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  852. health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  853. health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  854. health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  855. health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  856. health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  857. data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  858. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  859. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  860. #睡眠状态
  861. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  862. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  863. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  864. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  865. data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x)
  866. data_2020["wave"] = year
  867. change_columns(data_2020)
  868. data_2020 = pd.concat([data_2018, data_2020], axis=0)
  869. #修改地区名称
  870. #省份、城市名称和污染物数据格式对齐
  871. #海东地区->海东市
  872. data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
  873. #北京 -> 北京市
  874. data_2020['city'] = data_2020['city'].replace('北京', '北京市')
  875. data_2020['province'] = data_2020['province'].replace('北京', '北京市')
  876. #哈尔滨 -> 哈尔滨市
  877. data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
  878. #天津 -> 天津市
  879. data_2020['city'] = data_2020['city'].replace('天津', '天津市')
  880. data_2020['province'] = data_2020['province'].replace('天津', '天津市')
  881. #广西省 -> 广西壮族自治区
  882. data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
  883. #巢湖市 -> 合肥市
  884. data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
  885. #襄樊市->襄阳市
  886. data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
  887. data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
  888. print(123)