CHARLS_preprocess_main_paper.py 85 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078
  1. import pandas as pd
  2. import numpy as np
  3. import pyreadstat
  4. from datetime import date
  5. from lunarcalendar import Converter, Lunar
  6. #统一列名
  7. def change_columns(df):
  8. df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
  9. "waist", "Systolic","Diastolic",
  10. 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
  11. 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
  12. 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  13. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  14. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
  15. 'Physical_activity',
  16. 'Smoke','Drink',
  17. "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
  18. ]
  19. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  20. def process_row(row):
  21. da002_12_ = row['da003_12_']
  22. da002_13_ = row['da003_13_']
  23. if da002_12_ == 1 or da002_13_ == 1:
  24. return 1
  25. elif da002_12_ == 2 and da002_13_ == 2:
  26. return 2
  27. elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
  28. return 2
  29. elif pd.isna(da002_12_) and pd.isna(da002_13_):
  30. return np.nan
  31. else:
  32. return np.nan # 预防万一,其余情况下设为NA
  33. def update_da051(value):
  34. if value == 1:
  35. return 3
  36. elif value == 3:
  37. return 1
  38. else:
  39. return value
  40. if __name__ == "__main__":
  41. # 2011年
  42. year = "2011"
  43. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
  44. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
  45. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
  46. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
  47. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
  48. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
  49. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  50. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
  51. #性别#年龄#居住地#婚姻状况
  52. # 1 married or partnered
  53. # 0 other marital status (separated, divorced, unmarried, or widowed)
  54. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  55. #教育
  56. # 0 not finish primary school/No formal education illiterate
  57. # 1 finish primary school
  58. demo["education"] = demo["bd001"].apply(lambda x : 1 if x in [3,4,5,6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
  59. #获取随访时间
  60. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  61. data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
  62. #居住地
  63. data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  64. #身高#体重#收缩压#舒张压
  65. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  66. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  67. #腰围
  68. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  69. #血压测量后两次的平均
  70. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  71. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  72. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  73. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  74. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  75. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  76. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic']]
  77. data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  78. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  79. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  80. blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
  81. data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
  82. # 慢性病:
  83. # (1) Hypertension 高血压病
  84. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  85. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  86. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  87. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  88. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  89. # (除脂肪肝、肿瘤或癌外)
  90. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  91. # (8) Stroke 中风
  92. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  93. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  94. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  95. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  96. # (13) Arthritis or rheumatism 关节炎或风湿病
  97. # (14) Asthma 哮喘
  98. # 体力活动
  99. # 2 vigorous (vigorous activity more than once a week)
  100. # 1 moderate (moderate activity more than once a week)
  101. # 0 inactive (the rest)
  102. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  103. 1 if x["da051_2_"]==1 else
  104. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  105. else np.nan ,axis=1)
  106. # 抽烟
  107. # 1 抽过烟
  108. # 0 没有抽过烟
  109. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  110. # 喝酒
  111. # 1 喝过酒
  112. # 0 没有喝过酒
  113. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  114. 0 if x["da069"] == 1 else
  115. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  116. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  117. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  118. ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"]]
  119. data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  120. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆20分、画图1分
  121. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  122. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  123. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  124. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  125. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  126. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  127. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  128. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  129. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  130. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  131. #词语记忆
  132. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  133. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  134. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  135. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  136. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  137. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  138. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  139. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  140. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  141. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  142. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  143. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  144. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  145. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  146. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  147. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  148. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  149. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  150. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  151. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  152. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  153. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  154. #画图
  155. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  156. data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  157. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  158. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  159. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  160. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  161. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  162. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  163. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  164. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  165. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  166. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  167. health_status["draw_score"]
  168. #心理得分
  169. health_status["dc009_score"] = health_status["dc009"]-1
  170. health_status["dc010_score"] = health_status["dc010"]-1
  171. health_status["dc011_score"] = health_status["dc011"]-1
  172. health_status["dc012_score"] = health_status["dc012"]-1
  173. health_status["dc013_score"] = 4 - health_status["dc013"]
  174. health_status["dc014_score"] = health_status["dc014"]-1
  175. health_status["dc015_score"] = health_status["dc015"]-1
  176. health_status["dc016_score"] = 4 - health_status["dc016"]
  177. health_status["dc017_score"] = health_status["dc017"]-1
  178. health_status["dc018_score"] = health_status["dc018"]-1
  179. data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  180. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  181. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  182. #睡眠状态
  183. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  184. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  185. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  186. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  187. data_2011["sleep_state"] = health_status['dc015']
  188. #ADL
  189. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  190. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  191. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  192. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  193. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  194. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  195. data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  196. health_status["db014_score"] + health_status["db015_score"]
  197. data_2011["wave"] = year
  198. change_columns(data_2011)
  199. # 2011年的ID和其他年份有一点区别,倒数第三位加0
  200. data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
  201. # 2013年
  202. year = "2013"
  203. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  204. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
  205. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  206. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  207. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  208. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  209. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
  210. #性别#年龄#婚姻状况
  211. # 1 married or partnered
  212. # 0 other marital status (separated, divorced, unmarried, or widowed)
  213. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  214. #教育
  215. # 0 not finish primary school/No formal education illiterate
  216. # 1 finish primary school
  217. # 纠正2011年统计错误的教育
  218. demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
  219. demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x in [3,4,5, 6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
  220. education_correct = demo[['ID',"education_correct"]]
  221. # 按 'ID' 列合并两个表
  222. data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
  223. # 使用 fillna() 来更新字段
  224. data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
  225. # 删除多余的列
  226. data_2011 = data_2011.drop(columns=['education_correct'])
  227. #更新2013的教育
  228. demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  229. demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6,7,8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
  230. #合并2011年的教育
  231. eductaion_2011 = data_2011[['ID',"education"]]
  232. # 按 'ID' 列合并两个表
  233. demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  234. # 使用 fillna() 来更新字段
  235. demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
  236. # 纠正2011年统计错误的出生年
  237. demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
  238. demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
  239. birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
  240. # 按 'ID' 列合并两个表
  241. data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
  242. # 使用 fillna() 来更新字段
  243. data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
  244. data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
  245. # 删除多余的列
  246. data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
  247. #合并2011年的出生年
  248. birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
  249. # 按 'ID' 列合并两个表
  250. demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  251. # 使用 fillna() 来更新字段
  252. demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
  253. demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
  254. #获取随访时间
  255. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  256. data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
  257. #居住地
  258. data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  259. #身高#体重#收缩压#舒张压
  260. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  261. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  262. #腰围
  263. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  264. #血压测量后两次的平均
  265. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  266. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  267. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  268. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  269. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  270. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  271. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic']]
  272. data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  273. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  274. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  275. data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  276. # 慢性病:
  277. # (1) Hypertension 高血压病
  278. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  279. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  280. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  281. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  282. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  283. # (除脂肪肝、肿瘤或癌外)
  284. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  285. # (8) Stroke 中风
  286. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  287. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  288. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  289. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  290. # (13) Arthritis or rheumatism 关节炎或风湿病
  291. # (14) Asthma 哮喘
  292. # 体力活动
  293. # 2 vigorous (vigorous activity more than once a week)
  294. # 1 moderate (moderate activity more than once a week)
  295. # 0 inactive (the rest)
  296. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  297. 1 if x["da051_2_"]==1 else
  298. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  299. else np.nan ,axis=1)
  300. # 抽烟
  301. # 1 抽过烟
  302. # 0 没有抽过烟
  303. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  304. # 喝酒
  305. # 1 喝过酒
  306. # 0 没有喝过酒
  307. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  308. 0 if x["da069"] == 1 else
  309. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  310. # 合并2011年的慢性病
  311. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  312. ,'da007_12_','da007_13_','da007_14_']
  313. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  314. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  315. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  316. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  317. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  318. diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  319. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  320. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  321. # 按 'ID' 列合并两个表
  322. health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  323. # 使用 fillna() 来更新字段
  324. for col in columns_to_diseases_new:
  325. health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
  326. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  327. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  328. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  329. data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  330. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  331. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  332. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  333. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  334. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  335. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  336. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  337. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  338. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  339. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  340. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  341. #词语记忆
  342. health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  343. health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  344. health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  345. health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  346. health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  347. health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  348. health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  349. health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  350. health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  351. health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  352. # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  353. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  354. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  355. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  356. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  357. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  358. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  359. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  360. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  361. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  362. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  363. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  364. #画图
  365. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  366. data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  367. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  368. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  369. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  370. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  371. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  372. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  373. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  374. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  375. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  376. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  377. health_status["draw_score"]
  378. #心理得分
  379. health_status["dc009_score"] = health_status["dc009"]-1
  380. health_status["dc010_score"] = health_status["dc010"]-1
  381. health_status["dc011_score"] = health_status["dc011"]-1
  382. health_status["dc012_score"] = health_status["dc012"]-1
  383. health_status["dc013_score"] = 4 - health_status["dc013"]
  384. health_status["dc014_score"] = health_status["dc014"]-1
  385. health_status["dc015_score"] = health_status["dc015"]-1
  386. health_status["dc016_score"] = 4 - health_status["dc016"]
  387. health_status["dc017_score"] = health_status["dc017"]-1
  388. health_status["dc018_score"] = health_status["dc018"]-1
  389. data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  390. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  391. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  392. #睡眠状态
  393. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  394. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  395. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  396. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  397. data_2013["sleep_state"] = health_status['dc015']
  398. #ADL
  399. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  400. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  401. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  402. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  403. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  404. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  405. data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  406. health_status["db014_score"] + health_status["db015_score"]
  407. data_2013["wave"] = year
  408. change_columns(data_2013)
  409. data_2013 = pd.concat([data_2011, data_2013], axis=0)
  410. # 2015年
  411. year = "2015"
  412. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  413. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  414. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
  415. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  416. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  417. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  418. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  419. #性别#年龄#婚姻状况
  420. # 1 married or partnered
  421. # 0 other marital status (separated, divorced, unmarried, or widowed)
  422. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  423. #教育
  424. # 0 not finish primary school/No formal education illiterate
  425. # 1 finish primary school
  426. #更新2015的教育
  427. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  428. demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
  429. #合并2013年的教育
  430. eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
  431. # 按 'ID' 列合并两个表
  432. demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  433. # 使用 fillna() 来更新字段
  434. demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
  435. # 2015年的出生年
  436. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
  437. demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
  438. #获取随访时间
  439. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  440. data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  441. #居住地
  442. data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  443. #身高#体重#收缩压#舒张压
  444. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  445. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  446. #腰围
  447. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  448. #血压测量后两次的平均
  449. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  450. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  451. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  452. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  453. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  454. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  455. #身高#体重#收缩压#舒张压
  456. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic']]
  457. data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  458. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  459. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  460. blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
  461. data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
  462. # 慢性病:
  463. # (1) Hypertension 高血压病
  464. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  465. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  466. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  467. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  468. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  469. # (除脂肪肝、肿瘤或癌外)
  470. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  471. # (8) Stroke 中风
  472. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  473. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  474. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  475. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  476. # (13) Arthritis or rheumatism 关节炎或风湿病
  477. # (14) Asthma 哮喘
  478. # 体力活动
  479. # 2 vigorous (vigorous activity more than once a week)
  480. # 1 moderate (moderate activity more than once a week)
  481. # 0 inactive (the rest)
  482. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  483. 1 if x["da051_2_"]==1 else
  484. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  485. else np.nan ,axis=1)
  486. # 抽烟
  487. # 1 抽过烟
  488. # 0 没有抽过烟
  489. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  490. # 喝酒
  491. # 1 喝过酒
  492. # 0 没有喝过酒
  493. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  494. 0 if x["da069"] == 1 else
  495. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  496. # 合并2013年的慢性病
  497. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  498. ,'da007_12_','da007_13_','da007_14_']
  499. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  500. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  501. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  502. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  503. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  504. diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  505. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  506. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  507. # 按 'ID' 列合并两个表
  508. health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  509. # 使用 fillna() 来更新字段
  510. for col in columns_to_diseases_new:
  511. health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
  512. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  513. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  514. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  515. data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  516. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  517. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  518. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  519. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  520. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  521. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  522. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  523. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  524. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  525. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  526. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  527. #词语记忆
  528. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  529. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  530. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  531. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  532. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  533. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  534. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  535. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  536. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  537. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  538. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  539. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  540. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  541. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  542. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  543. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  544. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  545. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  546. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  547. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  548. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  549. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  550. #画图
  551. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  552. data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  553. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  554. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  555. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  556. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  557. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  558. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  559. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  560. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  561. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  562. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  563. health_status["draw_score"]
  564. #心理得分
  565. health_status["dc009_score"] = health_status["dc009"]-1
  566. health_status["dc010_score"] = health_status["dc010"]-1
  567. health_status["dc011_score"] = health_status["dc011"]-1
  568. health_status["dc012_score"] = health_status["dc012"]-1
  569. health_status["dc013_score"] = 4 - health_status["dc013"]
  570. health_status["dc014_score"] = health_status["dc014"]-1
  571. health_status["dc015_score"] = health_status["dc015"]-1
  572. health_status["dc016_score"] = 4 - health_status["dc016"]
  573. health_status["dc017_score"] = health_status["dc017"]-1
  574. health_status["dc018_score"] = health_status["dc018"]-1
  575. data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  576. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  577. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  578. #睡眠状态
  579. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  580. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  581. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  582. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  583. data_2015["sleep_state"] = health_status['dc015']
  584. #ADL
  585. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  586. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  587. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  588. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  589. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  590. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  591. data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  592. health_status["db014_score"] + health_status["db015_score"]
  593. data_2015["wave"] = year
  594. change_columns(data_2015)
  595. data_2015 = pd.concat([data_2013, data_2015], axis=0)
  596. # 2018年
  597. year = "2018"
  598. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  599. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  600. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  601. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  602. cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
  603. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  604. #性别#年龄#婚姻状况
  605. # 1 married or partnered
  606. # 0 other marital status (separated, divorced, unmarried, or widowed)
  607. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  608. #教育
  609. # 0 not finish primary school/No formal education illiterate
  610. # 1 finish primary school
  611. #更新2015的教育
  612. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
  613. demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
  614. # 出生年
  615. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
  616. demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
  617. #获取随访时间
  618. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  619. data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  620. #居住地
  621. data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  622. #身高#体重#收缩压#舒张压
  623. data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
  624. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  625. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  626. data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  627. # 慢性病:
  628. # (1) Hypertension 高血压病
  629. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  630. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  631. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  632. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  633. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  634. # (除脂肪肝、肿瘤或癌外)
  635. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  636. # (8) Stroke 中风
  637. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  638. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  639. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  640. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  641. # (13) Arthritis or rheumatism 关节炎或风湿病
  642. # (14) Asthma 哮喘
  643. # 体力活动
  644. # 2 vigorous (vigorous activity more than once a week)
  645. # 1 moderate (moderate activity more than once a week)
  646. # 0 inactive (the rest)
  647. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  648. 1 if x["da051_2_"]==1 else
  649. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  650. else np.nan ,axis=1)
  651. # 抽烟
  652. # 1 抽过烟
  653. # 0 没有抽过烟
  654. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  655. # 喝酒
  656. # 1 喝过酒
  657. # 0 没有喝过酒
  658. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  659. 0 if x["da069"] == 1 else
  660. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  661. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  662. ,'da007_12_','da007_13_','da007_14_']
  663. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  664. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  665. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  666. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  667. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  668. diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  669. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  670. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  671. # 按 'ID' 列合并两个表
  672. health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
  673. # 使用 fillna() 来更新字段
  674. for col in columns_to_diseases_new:
  675. health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
  676. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  677. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  678. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  679. data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  680. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  681. cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  682. cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  683. cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  684. cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  685. # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  686. cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1)
  687. cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1)
  688. cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
  689. cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
  690. cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
  691. #词语记忆
  692. cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
  693. cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
  694. cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
  695. cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1)
  696. cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1)
  697. cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)
  698. cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1)
  699. cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1)
  700. cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)
  701. cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)
  702. # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  703. cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1)
  704. cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1)
  705. cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1)
  706. cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1)
  707. cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1)
  708. cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)
  709. cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1)
  710. cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1)
  711. cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)
  712. cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)
  713. # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  714. #画图
  715. cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  716. data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
  717. cognition["dc001s3_score"] + cognition["dc002_score"]+ \
  718. cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
  719. cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
  720. cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
  721. cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
  722. cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
  723. cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
  724. cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
  725. cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
  726. cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
  727. cognition["draw_score"]
  728. #心理得分
  729. cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  730. cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  731. cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  732. cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  733. cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  734. cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  735. cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  736. cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  737. cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  738. cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  739. data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
  740. cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
  741. cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
  742. #睡眠状态
  743. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  744. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  745. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  746. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  747. data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x)
  748. #ADL
  749. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  750. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  751. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  752. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  753. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  754. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  755. data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  756. health_status["db014_score"] + health_status["db015_score"]
  757. data_2018["wave"] = year
  758. change_columns(data_2018)
  759. data_2018 = pd.concat([data_2015, data_2018], axis=0)
  760. # 2020年
  761. year = "2020"
  762. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  763. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  764. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  765. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  766. #性别#年龄#婚姻状况
  767. # 1 married or partnered
  768. # 0 other marital status (separated, divorced, unmarried, or widowed)
  769. demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
  770. #教育
  771. # 0 not finish primary school/No formal education illiterate
  772. # 1 finish primary school
  773. demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
  774. demo["education"] = demo["education"].apply(lambda x : 1 if x in [3,4,5, 6, 7, 8, 9, 10, 11] else 0 if x in [1,2] else np.nan)
  775. #合并2018年的教育
  776. eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
  777. # 按 'ID' 列合并两个表
  778. demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  779. # 使用 fillna() 来更新字段
  780. demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
  781. # 出生年
  782. demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
  783. demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
  784. #合并2018年的出生年
  785. birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
  786. # 按 'ID' 列合并两个表
  787. demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  788. # 使用 fillna() 来更新字段
  789. demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
  790. demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
  791. #获取随访时间
  792. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  793. demo["ba003"] = 1
  794. data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  795. #居住地
  796. data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
  797. #身高#体重#收缩压#舒张压
  798. data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
  799. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  800. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  801. data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  802. # 慢性病:
  803. # (1) Hypertension 高血压病
  804. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  805. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  806. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  807. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  808. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  809. # (除脂肪肝、肿瘤或癌外)
  810. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  811. # (8) Stroke 中风
  812. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  813. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  814. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  815. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  816. # (13) Arthritis or rheumatism 关节炎或风湿病
  817. # (14) Asthma 哮喘
  818. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  819. # 体力活动
  820. # 2 vigorous (vigorous activity more than once a week)
  821. # 1 moderate (moderate activity more than once a week)
  822. # 0 inactive (the rest)
  823. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else
  824. 1 if x["da032_2_"]==1 else
  825. 0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2)
  826. else np.nan ,axis=1)
  827. # 抽烟
  828. # 1 抽过烟
  829. # 0 没有抽过烟
  830. health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  831. # 喝酒
  832. # 1 喝过酒
  833. # 0 没有喝过酒
  834. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else
  835. 0 if x["da051"] == 3 else np.nan, axis=1)
  836. health_status['da003_12_'] = health_status.apply(process_row, axis=1)
  837. columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
  838. ,'da003_12_','da003_14_','da003_15_']
  839. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  840. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  841. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  842. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  843. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  844. diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  845. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  846. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  847. # 按 'ID' 列合并两个表
  848. health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  849. # 使用 fillna() 来更新字段
  850. for col in columns_to_diseases_new:
  851. health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
  852. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  853. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  854. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink"]]
  855. data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  856. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  857. health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  858. health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  859. health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  860. health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  861. health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  862. health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1)
  863. health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1)
  864. health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
  865. health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
  866. health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
  867. #词语记忆
  868. health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
  869. health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
  870. health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
  871. health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1)
  872. health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1)
  873. health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)
  874. health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1)
  875. health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1)
  876. health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)
  877. health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)
  878. health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1)
  879. health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1)
  880. health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1)
  881. health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1)
  882. health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1)
  883. health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)
  884. health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1)
  885. health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1)
  886. health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)
  887. health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)
  888. #画图
  889. health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  890. data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  891. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  892. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  893. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  894. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  895. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  896. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  897. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  898. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  899. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  900. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  901. health_status["draw_score"]
  902. #心理得分
  903. health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  904. health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  905. health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  906. health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  907. health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  908. health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  909. health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  910. health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  911. health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  912. health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  913. data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  914. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  915. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  916. #睡眠状态
  917. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  918. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  919. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  920. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  921. data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x)
  922. #ADL
  923. health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  924. health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  925. health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  926. health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  927. health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  928. health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  929. data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  930. health_status["db014_score"] + health_status["db015_score"]
  931. data_2020["wave"] = year
  932. change_columns(data_2020)
  933. data_2020 = pd.concat([data_2018, data_2020], axis=0)
  934. #修改地区名称
  935. #省份、城市名称和污染物数据格式对齐
  936. #海东地区->海东市
  937. data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
  938. #北京 -> 北京市
  939. data_2020['city'] = data_2020['city'].replace('北京', '北京市')
  940. data_2020['province'] = data_2020['province'].replace('北京', '北京市')
  941. #哈尔滨 -> 哈尔滨市
  942. data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
  943. #天津 -> 天津市
  944. data_2020['city'] = data_2020['city'].replace('天津', '天津市')
  945. data_2020['province'] = data_2020['province'].replace('天津', '天津市')
  946. #广西省 -> 广西壮族自治区
  947. data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
  948. #巢湖市 -> 合肥市
  949. data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
  950. #襄樊市->襄阳市
  951. data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
  952. data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
  953. print(123)