CHARLS_preprocess_2.py 155 KB


  1. import pandas as pd
  2. import numpy as np
  3. import pyreadstat
  4. from datetime import date
  5. from lunarcalendar import Converter, Lunar
  6. #统一列名
  7. def change_columns(df):
  8. df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"urban_nbs","Height", "Weight",
  9. "waist", "Systolic","Diastolic", "Sit_Stand_5x", "Walking_Speed_Time",
  10. 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
  11. 'bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc',
  12. 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  13. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  14. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma',
  15. 'Physical_activity',
  16. 'Smoke','Drink', "Accident_Or_Injury","Fell_In_Last2Years", "Wear_Glasses"
  17. , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  18. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days"
  19. , "Interacted_With_Friends", "Played_Ma_jong", "Provided_help", "Sport", "Community_Related_Organization", "Charity_work", "Training_course", "Other", "None"
  20. , "Internet_Usage_LastMonth", "Drink_PastYear",
  21. "Cognition_score", "Psychiatric_score","sleep_state", "ADL",
  22. 'Gas_Connection','Heating_Facility', 'Heating_Energy', 'Cooking_Fuel', "wave",
  23. ]
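# The 2020 wave asks about Parkinson's disease and memory-related disease separately; they are merged back into one item so the variable lines up with earlier waves.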
  25. def process_row(row):
  26. da002_12_ = row['da003_12_']
  27. da002_13_ = row['da003_13_']
  28. if da002_12_ == 1 or da002_13_ == 1:
  29. return 1
  30. elif da002_12_ == 2 and da002_13_ == 2:
  31. return 2
  32. elif (da002_12_ == 2 and pd.isna(da002_13_)) or (pd.isna(da002_12_) and da002_13_ == 2):
  33. return 2
  34. elif pd.isna(da002_12_) and pd.isna(da002_13_):
  35. return np.nan
  36. else:
  37. return np.nan # 预防万一,其余情况下设为NA
  38. def update_da051(value):
  39. if value == 1:
  40. return 3
  41. elif value == 3:
  42. return 1
  43. else:
  44. return value
  45. if __name__ == "__main__":
  46. # 2011年
  47. year = "2011"
  48. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/demographic_background.dta")
  49. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/psu.dta", encoding='gbk')
  50. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/biomarkers.dta")
  51. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood_20140429.dta")
  52. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
  53. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
  54. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  55. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
  56. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/housing_characteristics.dta")
  57. #性别#年龄#居住地#婚姻状况
  58. # 1 married or partnered
  59. # 0 other marital status (separated, divorced, unmarried, or widowed)
  60. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  61. #教育
  62. # 0 below high school
  63. # 1 high school
  64. # 2 college or above
  65. demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  66. #获取随访时间
  67. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  68. data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
  69. #居住地
  70. # 0 农村
  71. # 1 城市
  72. data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  73. #身高#体重#收缩压#舒张压
  74. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  75. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  76. #腰围
  77. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  78. #血压测量后两次的平均
  79. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  80. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  81. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  82. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  83. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  84. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  85. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  86. # 1 yes
  87. # 0 no
  88. biomarkers["Sit_Stand_5x"] = biomarkers["qh002"].apply(lambda x : 1 if x == 1 else 0 if x == 5 else np.nan)
  89. # 步行速度时间
  90. biomarkers["Walking_Speed_Time"] = (biomarkers["qg002"] + biomarkers["qg003"]) /2
  91. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', "waist",'Systolic','Diastolic', "Sit_Stand_5x", "Walking_Speed_Time"]]
  92. data_2011 = pd.merge(data_2011, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  93. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  94. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  95. blood = blood.loc[:, blood.columns.difference(["bloodweight", "qc1_va003"])]
  96. data_2011 = pd.merge(data_2011, blood, on = ["ID"], how="left")
  97. # 慢性病:
  98. # (1) Hypertension 高血压病
  99. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  100. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  101. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  102. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  103. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  104. # (除脂肪肝、肿瘤或癌外)
  105. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  106. # (8) Stroke 中风
  107. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  108. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  109. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  110. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  111. # (13) Arthritis or rheumatism 关节炎或风湿病
  112. # (14) Asthma 哮喘
  113. # 体力活动
  114. # 2 vigorous (vigorous activity more than once a week)
  115. # 1 moderate (moderate activity more than once a week)
  116. # 0 inactive (the rest)
  117. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  118. 1 if x["da051_2_"]==1 else
  119. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  120. else np.nan ,axis=1)
  121. # 抽烟
  122. # 1 抽过烟
  123. # 0 没有抽过烟
  124. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  125. # 喝酒
  126. # 1 喝过酒
  127. # 0 没有喝过酒
  128. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  129. 0 if x["da069"] == 1 else
  130. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  131. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  132. # 1 是
  133. # 0 否
  134. health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else
  135. 0 if x == 2 else np.nan)
  136. # 过去两年有没有摔倒?
  137. # 1 是
  138. # 0 否
  139. health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else
  140. 0 if x == 2 else np.nan)
  141. # # 您什么时候开始来月经的?(year/age)
  142. # health_status['Menarche_Year']=health_status["da026_1"]
  143. # health_status['Menarche_Age']=health_status["da026_2"]
  144. # # 您什么时候开始绝经的?
  145. # health_status['Menopause_Year']=health_status["da028_1"]
  146. # health_status['Menopause_Age']=health_status["da028_2"]
  147. # # 第一次诊断出您有前列腺疾病是在什么时候?
  148. # health_status['Prostate_Issue_Year']=health_status["da030_1"]
  149. # health_status['Prostate_Issue_Age']=health_status["da030_2"]
  150. # 是否戴眼镜(包括矫正视力镜片)?
  151. # 1 是
  152. # 0 否
  153. # 2 失明
  154. # 3 偶尔
  155. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else np.nan)
  156. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  157. health_status['Average_Sleep_Hours']=health_status["da049"]
  158. # 过去一个月内,您通常午睡多长时间?分钟
  159. health_status['Average_Nap_Minutes']=health_status["da050"]
  160. # 您通常每周有没有至少持续做激烈活动十分钟?
  161. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  162. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  163. health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  164. # 您通常每周有没有至少持续走路十分钟?
  165. health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  166. # 您通常每周有多少天做[激烈活动]至少十分钟?
  167. health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1)
  168. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  169. health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1)
  170. # 您通常每周有多少天做[走路]至少十分钟?
  171. health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1)
  172. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  173. # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  174. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  175. # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  176. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  177. # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  178. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  179. # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  180. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  181. # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  182. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  183. # health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  184. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  185. # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  186. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  187. # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  188. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  189. # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  190. # 活动的原因
  191. # 1 工作需要
  192. # 2 娱乐
  193. # 3 体育锻炼
  194. # 4 其他
  195. # health_status[["Reason_For_Vigorous_Activity"]]= np.nan
  196. # health_status[["Reason_For_Moderate_Effort"]]= np.nan
  197. # health_status[["Reason_For_Walking"]]= np.nan
  198. # 过去一个月是否进行了下列社交活动?
  199. # (1) 串门、跟朋友交往
  200. # (2) 打麻将、下棋、打牌、去社区活动室
  201. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  202. # (4) 去公园或者其他场所跳舞、健身、练气功等
  203. # (5) 参加社团组织活动
  204. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  205. # (7) 上学或者参加培训课程
  206. # (8)其他
  207. # (9) 以上均没有
  208. health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1)
  209. health_status["da056s2"] = health_status.apply(lambda x: 1 if x["da056s2"]==2 else 0, axis=1)
  210. health_status["da056s3"] = health_status.apply(lambda x: 1 if x["da056s3"]==3 else 0, axis=1)
  211. health_status["da056s4"] = health_status.apply(lambda x: 1 if x["da056s4"]==4 else 0, axis=1)
  212. health_status["da056s5"] = health_status.apply(lambda x: 1 if x["da056s5"]==5 else 0, axis=1)
  213. health_status["da056s6"] = health_status.apply(lambda x: 1 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1)
  214. health_status["da056s7"] = health_status.apply(lambda x: 1 if x["da056s8"]==8 else 0, axis=1)
  215. health_status["da056s8"] = health_status.apply(lambda x: 1 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1)
  216. health_status["da056s9"] = health_status.apply(lambda x: 1 if x["da056s12"]==12 else 0, axis=1)
  217. # 过去一个月的活动频率
  218. # (1) Almost daily 差不多每天
  219. # (2) Almost every week 差不多每周
  220. # (3) Not regularly 不经常
  221. # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  222. # health_status["da057_7_"] = health_status["da057_8_"]
  223. # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  224. # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  225. # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  226. # else np.nan, axis=1)
  227. # 过去一个月,您是否上网?
  228. # 1 是
  229. # 0 否
  230. health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0)
  231. # # 使用以下哪些工具上网?
  232. # health_status[["Internet_Tools_Desktop_computer"]] = np.nan
  233. # health_status[["Internet_Tools_Laptop_computer"]] = np.nan
  234. # health_status[["Internet_Tools_Tablet_computer"]] = np.nan
  235. # health_status[["Internet_Tools_Cellphone"]] = np.nan
  236. # health_status[["Internet_Tools_Other"]] = np.nan
  237. # # 上网一般做什么?
  238. # health_status[["Internet_Purpose_Chat"]] = np.nan
  239. # health_status[["Internet_Purpose_news"]] = np.nan
  240. # health_status[["Internet_Purpose_videos"]] = np.nan
  241. # health_status[["Internet_Purpose_games"]] = np.nan
  242. # health_status[["Internet_Purpose_Financial"]] = np.nan
  243. # health_status[["Internet_Purpose_Others"]] = np.nan
  244. # # 是否会用手机支付
  245. # health_status[["Mobile_Payment"]] = np.nan
  246. # # 是否使用微信?
  247. # health_status[["Wechat_Usage"]] = np.nan
  248. # # 发不发微信朋友圈?
  249. # health_status[["Post_Moments"]] = np.nan
  250. # # 现在还在吸烟还是戒烟了?
  251. # # 1 仍然抽烟 Skip DA062 请跳过DA062
  252. # # 2 戒烟
  253. # health_status['Current_Smoking_Status']=health_status["da061"]
  254. # 吸烟时,一般抽什么烟?
  255. # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  256. # (2) Smoking self-rolled cigarettes 自己卷烟抽
  257. # (3) Filtered cigarette带滤咀香烟
  258. # (4) Unfiltered cigarette不带滤咀香烟
  259. # (5) Cigar雪茄
  260. # (6) Water cigarettes 水烟
  261. # health_status.loc[health_status['da060'] == 1, 'Smoking_Type_pipe'] = 1
  262. # health_status.loc[health_status['da060'] == 2, 'Smoking_Type_rolled'] = 2
  263. # health_status.loc[health_status['da060'] == 3, 'Smoking_Type_Filtered'] = 3
  264. # health_status.loc[health_status['da060'] == 4, 'Smoking_Type_Unfiltered'] = 4
  265. # health_status.loc[health_status['da060'] == 5, 'Smoking_Type_Cigar'] = 5
  266. # health_status.loc[health_status['da060'] == 6, 'Smoking_Type_Water'] = 6
  267. # 现在/戒烟前平均一天抽多少支香烟?
  268. # health_status['Daily_Cigarette_Count']=health_status["da063"]
  269. # 在过去的一年, 喝酒吗
  270. # (1) Drink more than once a month. 喝酒,每月超过一次
  271. # (2) Drink but less than once a month 喝酒,但每月少于一次
  272. # (3) None of these 什么都不喝
  273. health_status['Drink_PastYear']=health_status["da067"]
  274. # 过去一年内 平均一个月喝几次酒
  275. # (1)Once a month 每月一次
  276. # (2)2-3 times a month 每月2-3次
  277. # (3)Once a week 每周一次
  278. # (4)2-3 times a week 每周2-3次
  279. # (5)4-6 times a week 每周4-6次
  280. # (6)Once a day 每天一次
  281. # (7)Twice a day 一天两次
  282. # (8)More than twice a day 一天超过两次
  283. # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  284. # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  285. # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  286. # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  287. # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  288. # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  289. # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  290. # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  291. health_status_select = health_status[['ID','householdID', 'communityID', 'da007_1_', 'da007_2_','da007_3_'
  292. ,'da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  293. ,'da007_12_','da007_13_','da007_14_', "Physical_activity", "Smoke", "Drink"
  294. , "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses"
  295. , "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  296. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days"
  297. , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9"
  298. , "Internet_Usage_LastMonth", "Drink_PastYear"]]
  299. data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  300. # 自上次访问以来的两年内,您是否发作过心脏病?
  301. # 1 是
  302. # 0 否
  303. data_2011[['Heart_attack_2_years']]=np.nan
  304. # 自上次访问以来,是否有医生诊断您中风复发?
  305. # 1 是
  306. # 0 否
  307. data_2011[['Recurrent_Stroke']]=np.nan
  308. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆20分、画图1分
  309. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  310. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  311. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  312. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  313. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  314. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  315. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  316. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  317. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  318. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  319. #词语记忆
  320. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  321. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  322. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  323. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  324. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  325. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  326. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  327. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  328. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  329. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  330. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  331. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  332. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  333. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  334. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  335. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  336. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  337. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  338. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  339. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  340. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  341. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  342. #画图
  343. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  344. data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  345. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  346. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  347. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  348. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  349. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  350. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  351. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  352. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  353. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  354. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  355. health_status["draw_score"]
  356. #心理得分
  357. health_status["dc009_score"] = health_status["dc009"]-1
  358. health_status["dc010_score"] = health_status["dc010"]-1
  359. health_status["dc011_score"] = health_status["dc011"]-1
  360. health_status["dc012_score"] = health_status["dc012"]-1
  361. health_status["dc013_score"] = 4 - health_status["dc013"]
  362. health_status["dc014_score"] = health_status["dc014"]-1
  363. health_status["dc015_score"] = health_status["dc015"]-1
  364. health_status["dc016_score"] = 4 - health_status["dc016"]
  365. health_status["dc017_score"] = health_status["dc017"]-1
  366. health_status["dc018_score"] = health_status["dc018"]-1
  367. data_2011["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  368. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  369. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  370. #睡眠状态
  371. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  372. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  373. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  374. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  375. data_2011["sleep_state"] = health_status['dc015']
  376. #ADL
  377. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  378. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  379. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  380. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  381. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  382. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  383. data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  384. health_status["db014_score"] + health_status["db015_score"]
  385. # 是否有管道煤气或天然气?
  386. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  387. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  388. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  389. # 供暖所用的主要能源是什么?
  390. # (1)Solar 太阳能
  391. # (2)Coal 煤炭、蜂窝煤
  392. # (3)Natural gas 管道天然气或煤气
  393. # (4)Liquefied Petroleum Gas 液化石油气
  394. # (5)Electric 电
  395. # (6)Crop residue/Wood buring 秸秆、柴火
  396. # (7)Other 其他
  397. houseing["Heating_Energy"] = houseing["i021"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan)
  398. # 做饭用的主要燃料是什么?
  399. # (1)Coal 煤炭、蜂窝煤
  400. # (2)Natural gas 管道天然气或煤气
  401. # (3)Marsh gas 沼气
  402. # (4)Liquefied Petroleum Gas 液化石油气
  403. # (5)Electric 电
  404. # (6)crop residue/Wood burning 秸秆、柴火
  405. # (7)other 其他
  406. houseing["Cooking_Fuel"] = houseing["i022"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan)
  407. houseing_select = houseing[['householdID', 'communityID','Gas_Connection',
  408. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  409. data_2011 = pd.merge(data_2011, houseing_select, on = ['householdID', 'communityID'], how="left")
  410. data_2011["wave"] = year
  411. change_columns(data_2011)
  412. # 2011年的ID和其他年份有一点区别,倒数第三位加0
  413. data_2011["ID"] = data_2011["ID"].apply(lambda x : x[:-2] + '0' + x[-2:] if len(str(x)) >= 3 else x)
  414. print("2011 complete")
  415. # 2013年
  416. year = "2013"
  417. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  418. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/PSU.dta", encoding='gbk')
  419. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  420. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  421. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  422. exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
  423. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
  424. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing_Characteristics.dta")
  425. #性别#年龄#婚姻状况
  426. # 1 married or partnered
  427. # 0 other marital status (separated, divorced, unmarried, or widowed)
  428. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  429. #教育
  430. # 0 below high school
  431. # 1 high school
  432. # 2 college or above
  433. # 纠正2011年统计错误的教育
  434. demo["education_correct"] = demo.apply(lambda x : x["bd001_w2_3"] if x["bd001_w2_1"]==2 else np.nan, axis=1)
  435. demo["education_correct"] = demo["education_correct"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  436. education_correct = demo[['ID',"education_correct"]]
  437. # 按 'ID' 列合并两个表
  438. data_2011 = pd.merge(data_2011, education_correct, on='ID', how='left')
  439. # 使用 fillna() 来更新字段
  440. data_2011['education'] = data_2011['education_correct'].fillna(data_2011['education'])
  441. # 删除多余的列
  442. data_2011 = data_2011.drop(columns=['education_correct'])
  443. #更新2013的教育
  444. demo["education"] = demo.apply(lambda x : x["bd001"] if pd.isna(x["bd001_w2_1"]) else x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  445. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  446. #合并2011年的教育
  447. eductaion_2011 = data_2011[['ID',"education"]]
  448. # 按 'ID' 列合并两个表
  449. demo = pd.merge(demo, eductaion_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  450. # 使用 fillna() 来更新字段
  451. demo['education'] = demo['education_2013'].fillna(demo['education_2011'])
  452. # 纠正2011年统计错误的出生年
  453. demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
  454. demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
  455. birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
  456. # 按 'ID' 列合并两个表
  457. data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
  458. # 使用 fillna() 来更新字段
  459. data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
  460. data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
  461. # 删除多余的列
  462. data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
  463. #合并2011年的出生年
  464. birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
  465. # 按 'ID' 列合并两个表
  466. demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  467. # 使用 fillna() 来更新字段
  468. demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
  469. demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
  470. #获取随访时间
  471. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  472. data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
  473. #居住地
  474. # 0 农村
  475. # 1 城市
  476. data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  477. #身高#体重#收缩压#舒张压
  478. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  479. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  480. #腰围
  481. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  482. #血压测量后两次的平均
  483. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  484. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  485. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  486. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  487. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  488. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  489. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  490. # 1 yes
  491. # 0 no
  492. biomarkers["Sit_Stand_5x"] = biomarkers["qh002"].apply(lambda x : 1 if x == 1 else 0 if x == 5 else np.nan)
  493. # 步行速度时间
  494. biomarkers["Walking_Speed_Time"] = (biomarkers["qg002"] + biomarkers["qg003"]) /2
  495. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002','ql002', 'waist','Systolic','Diastolic', "Sit_Stand_5x", "Walking_Speed_Time"]]
  496. data_2013 = pd.merge(data_2013, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  497. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  498. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  499. data_2013[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  500. # 慢性病:
  501. # (1) Hypertension 高血压病
  502. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  503. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  504. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  505. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  506. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  507. # (除脂肪肝、肿瘤或癌外)
  508. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  509. # (8) Stroke 中风
  510. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  511. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  512. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  513. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  514. # (13) Arthritis or rheumatism 关节炎或风湿病
  515. # (14) Asthma 哮喘
  516. # 体力活动
  517. # 2 vigorous (vigorous activity more than once a week)
  518. # 1 moderate (moderate activity more than once a week)
  519. # 0 inactive (the rest)
  520. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  521. 1 if x["da051_2_"]==1 else
  522. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  523. else np.nan ,axis=1)
  524. # 抽烟
  525. # 1 抽过烟
  526. # 0 没有抽过烟
  527. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  528. # 喝酒
  529. # 1 喝过酒
  530. # 0 没有喝过酒
  531. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  532. 0 if x["da069"] == 1 else
  533. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  534. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  535. # 1 是
  536. # 0 否
  537. health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else
  538. 0 if x == 2 else np.nan)
  539. # 过去两年有没有摔倒?
  540. # 1 是
  541. # 0 否
  542. health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else
  543. 0 if x == 2 else np.nan)
  544. # # 您什么时候开始来月经的?(year/age)
  545. # health_status['Menarche_Year']=health_status["da026_1"]
  546. # health_status['Menarche_Age']=health_status["da026_2"]
  547. # # 您什么时候开始绝经的?
  548. # health_status['Menopause_Year']=health_status["da028_1"]
  549. # health_status['Menopause_Age']=health_status["da028_2"]
  550. # # 第一次诊断出您有前列腺疾病是在什么时候?
  551. # health_status['Prostate_Issue_Year']=health_status["da030_1"]
  552. # health_status['Prostate_Issue_Age']=health_status["da030_2"]
  553. # 是否戴眼镜(包括矫正视力镜片)?
  554. # 1 是
  555. # 0 否
  556. # 2 失明
  557. # 3 偶尔
  558. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan)
  559. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  560. health_status['Average_Sleep_Hours']=health_status["da049"]
  561. # 过去一个月内,您通常午睡多长时间?分钟
  562. health_status['Average_Nap_Minutes']=health_status["da050"]
  563. # 您通常每周有没有至少持续做激烈活动十分钟?
  564. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  565. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  566. health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  567. # 您通常每周有没有至少持续走路十分钟?
  568. health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  569. # 您通常每周有多少天做[激烈活动]至少十分钟?
  570. health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1)
  571. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  572. health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1)
  573. # 您通常每周有多少天做[走路]至少十分钟?
  574. health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1)
  575. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  576. # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  577. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  578. # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  579. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  580. # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  581. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  582. # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  583. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  584. # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  585. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  586. # health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  587. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  588. # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  589. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  590. # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  591. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  592. # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  593. # 活动的原因
  594. # 1 工作需要
  595. # 2 娱乐
  596. # 3 体育锻炼
  597. # 4 其他
  598. # health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"]
  599. # health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"]
  600. # health_status["Reason_For_Walking"]= health_status["da051_1_3_"]
  601. # 过去一个月是否进行了下列社交活动?
  602. # (1) 串门、跟朋友交往
  603. # (2) 打麻将、下棋、打牌、去社区活动室
  604. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  605. # (4) 去公园或者其他场所跳舞、健身、练气功等
  606. # (5) 参加社团组织活动
  607. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  608. # (7) 上学或者参加培训课程
  609. # (8)其他
  610. # (9) 以上均没有
  611. health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1)
  612. health_status["da056s2"] = health_status.apply(lambda x: 1 if x["da056s2"]==2 else 0, axis=1)
  613. health_status["da056s3"] = health_status.apply(lambda x: 1 if x["da056s3"]==3 else 0, axis=1)
  614. health_status["da056s4"] = health_status.apply(lambda x: 1 if x["da056s4"]==4 else 0, axis=1)
  615. health_status["da056s5"] = health_status.apply(lambda x: 1 if x["da056s5"]==5 else 0, axis=1)
  616. health_status["da056s6"] = health_status.apply(lambda x: 1 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1)
  617. health_status["da056s7"] = health_status.apply(lambda x: 1 if x["da056s8"]==8 else 0, axis=1)
  618. health_status["da056s8"] = health_status.apply(lambda x: 1 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1)
  619. health_status["da056s9"] = health_status.apply(lambda x: 1 if x["da056s12"]==12 else 0, axis=1)
  620. # 过去一个月的活动频率
  621. # (1) Almost daily 差不多每天
  622. # (2) Almost every week 差不多每周
  623. # (3) Not regularly 不经常
  624. # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  625. # health_status["da057_7_"] = health_status["da057_8_"]
  626. # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  627. # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  628. # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  629. # else np.nan, axis=1)
  630. # 过去一个月,您是否上网?
  631. health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0)
  632. # # 使用以下哪些工具上网?
  633. # health_status[["Internet_Tools_Desktop_computer"]] = np.nan
  634. # health_status[["Internet_Tools_Laptop_computer"]] = np.nan
  635. # health_status[["Internet_Tools_Tablet_computer"]] = np.nan
  636. # health_status[["Internet_Tools_Cellphone"]] = np.nan
  637. # health_status[["Internet_Tools_Other"]] = np.nan
  638. # # 上网一般做什么?
  639. # health_status[["Internet_Purpose_Chat"]] = np.nan
  640. # health_status[["Internet_Purpose_news"]] = np.nan
  641. # health_status[["Internet_Purpose_videos"]] = np.nan
  642. # health_status[["Internet_Purpose_games"]] = np.nan
  643. # health_status[["Internet_Purpose_Financial"]] = np.nan
  644. # health_status[["Internet_Purpose_Others"]] = np.nan
  645. # # 是否会用手机支付
  646. # health_status[["Mobile_Payment"]] = np.nan
  647. # # 是否使用微信?
  648. # health_status[["Wechat_Usage"]] = np.nan
  649. # # 发不发微信朋友圈?
  650. # health_status[["Post_Moments"]] = np.nan
  651. # # 现在还在吸烟还是戒烟了?
  652. # # 1 仍然抽烟 Skip DA062 请跳过DA062
  653. # # 2 戒烟
  654. # health_status['Current_Smoking_Status']=health_status["da061"]
  655. # # 吸烟时,一般抽什么烟?
  656. # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  657. # # (2) Smoking self-rolled cigarettes 自己卷烟抽
  658. # # (3) Filtered cigarette带滤咀香烟
  659. # # (4) Unfiltered cigarette不带滤咀香烟
  660. # # (5) Cigar雪茄
  661. # # (6) Water cigarettes 水烟
  662. # health_status[['Smoking_Type_pipe']]=np.nan
  663. # health_status[['Smoking_Type_rolled']]=np.nan
  664. # health_status[['Smoking_Type_Filtered']]=np.nan
  665. # health_status[['Smoking_Type_Unfiltered']]=np.nan
  666. # health_status[['Smoking_Type_Cigar']]=np.nan
  667. # health_status[['Smoking_Type_Water']]=np.nan
  668. # # 现在/戒烟前平均一天抽多少支香烟?
  669. # health_status['Daily_Cigarette_Count']=health_status["da063"]
  670. # 在过去的一年, 喝酒吗
  671. # (1) Drink more than once a month. 喝酒,每月超过一次
  672. # (2) Drink but less than once a month 喝酒,但每月少于一次
  673. # (3) None of these 什么都不喝
  674. health_status['Drink_PastYear']=health_status["da067"]
  675. # 过去一年内 平均一个月喝几次酒
  676. # (1)Once a month 每月一次
  677. # (2)2-3 times a month 每月2-3次
  678. # (3)Once a week 每周一次
  679. # (4)2-3 times a week 每周2-3次
  680. # (5)4-6 times a week 每周4-6次
  681. # (6)Once a day 每天一次
  682. # (7)Twice a day 一天两次
  683. # (8)More than twice a day 一天超过两次
  684. # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  685. # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  686. # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  687. # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  688. # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  689. # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  690. # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  691. # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  692. # 自上次访问以来的两年内,您是否发作过心脏病?
  693. # 1 是
  694. # 0 否
  695. health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else
  696. 0 if x["da007_w2_5"] == 2 else np.nan, axis=1)
  697. # 自上次访问以来,是否有医生诊断您中风复发?
  698. # 1 是
  699. # 0 否
  700. health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else
  701. 0 if x["da019_w2_1"] == 2 else np.nan, axis=1)
  702. # 合并2011年的慢性病
  703. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  704. ,'da007_12_','da007_13_','da007_14_']
  705. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  706. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  707. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  708. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  709. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  710. diseases_2011 = data_2011[['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  711. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  712. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  713. # 按 'ID' 列合并两个表
  714. health_status = pd.merge(health_status, diseases_2011, on='ID', how='left', suffixes=("_2013","_2011"))
  715. # 使用 fillna() 来更新字段
  716. for col in columns_to_diseases_new:
  717. health_status[col] = health_status[f'{col}_2013'].fillna(health_status[f'{col}_2011'])
  718. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  719. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  720. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" ,
  721. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  722. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days"
  723. , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9"
  724. , "Internet_Usage_LastMonth", "Drink_PastYear"]]
  725. data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  726. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  727. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  728. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  729. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  730. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  731. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  732. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  733. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  734. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  735. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  736. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  737. #词语记忆
  738. health_status["dc006s1_score"] = health_status["dc006_1_s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  739. health_status["dc006s2_score"] = health_status["dc006_1_s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  740. health_status["dc006s3_score"] = health_status["dc006_1_s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  741. health_status["dc006s4_score"] = health_status["dc006_1_s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  742. health_status["dc006s5_score"] = health_status["dc006_1_s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  743. health_status["dc006s6_score"] = health_status["dc006_1_s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  744. health_status["dc006s7_score"] = health_status["dc006_1_s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  745. health_status["dc006s8_score"] = health_status["dc006_1_s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  746. health_status["dc006s9_score"] = health_status["dc006_1_s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  747. health_status["dc006s10_score"] = health_status["dc006_1_s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  748. # health_status["dc006s11_score"] = health_status["dc006_1_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  749. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  750. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  751. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  752. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  753. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  754. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  755. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  756. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  757. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  758. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  759. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  760. #画图
  761. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  762. data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  763. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  764. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  765. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  766. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  767. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  768. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  769. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  770. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  771. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  772. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  773. health_status["draw_score"]
  774. #心理得分
  775. health_status["dc009_score"] = health_status["dc009"]-1
  776. health_status["dc010_score"] = health_status["dc010"]-1
  777. health_status["dc011_score"] = health_status["dc011"]-1
  778. health_status["dc012_score"] = health_status["dc012"]-1
  779. health_status["dc013_score"] = 4 - health_status["dc013"]
  780. health_status["dc014_score"] = health_status["dc014"]-1
  781. health_status["dc015_score"] = health_status["dc015"]-1
  782. health_status["dc016_score"] = 4 - health_status["dc016"]
  783. health_status["dc017_score"] = health_status["dc017"]-1
  784. health_status["dc018_score"] = health_status["dc018"]-1
  785. data_2013["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  786. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  787. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  788. #睡眠状态
  789. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  790. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  791. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  792. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  793. data_2013["sleep_state"] = health_status['dc015']
  794. #ADL
  795. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  796. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  797. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  798. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  799. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  800. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  801. data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  802. health_status["db014_score"] + health_status["db015_score"]
  803. # 是否有管道煤气或天然气?
  804. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  805. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  806. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  807. # 供暖所用的主要能源是什么?
  808. # (1)Solar 太阳能
  809. # (2)Coal 煤炭、蜂窝煤
  810. # (3)Natural gas 管道天然气或煤气
  811. # (4)Liquefied Petroleum Gas 液化石油气
  812. # (5)Electric 电
  813. # (6)Crop residue/Wood buring 秸秆、柴火
  814. # (7)Other 其他
  815. houseing["Heating_Energy"] = houseing["i021"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan)
  816. # 做饭用的主要燃料是什么?
  817. # (1)Coal 煤炭、蜂窝煤
  818. # (2)Natural gas 管道天然气或煤气
  819. # (3)Marsh gas 沼气
  820. # (4)Liquefied Petroleum Gas 液化石油气
  821. # (5)Electric 电
  822. # (6)crop residue/Wood burning 秸秆、柴火
  823. # (7)other 其他
  824. houseing["Cooking_Fuel"] = houseing["i022"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan)
  825. houseing_select = houseing[['ID','householdID', 'communityID','Gas_Connection',
  826. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  827. data_2013 = pd.merge(data_2013, houseing_select, on = ["ID", 'householdID', 'communityID'], how="left")
  828. data_2013["wave"] = year
  829. change_columns(data_2013)
  830. data_2013 = pd.concat([data_2011, data_2013], axis=0)
  831. print("2013 complete")
  832. # 2015年
  833. year = "2015"
  834. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  835. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  836. blood, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Blood.dta")
  837. biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
  838. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  839. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  840. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  841. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing_Characteristics.dta")
  842. #性别#年龄#婚姻状况
  843. # 1 married or partnered
  844. # 0 other marital status (separated, divorced, unmarried, or widowed)
  845. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be001"]==7 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  846. #教育
  847. # 0 below high school
  848. # 1 high school
  849. # 2 college or above
  850. #更新2015的教育
  851. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) and not x["bd001_w2_4"]==12 else np.nan, axis=1)
  852. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  853. #合并2013年的教育
  854. eductaion_2013 = data_2013[data_2013["wave"]=="2013"][['ID',"education"]]
  855. # 按 'ID' 列合并两个表
  856. demo = pd.merge(demo, eductaion_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  857. # 使用 fillna() 来更新字段
  858. demo['education'] = demo['education_2015'].fillna(demo['education_2013'])
  859. # 2015年的出生年
  860. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
  861. demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
  862. #获取随访时间
  863. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  864. data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  865. #居住地
  866. # 0 农村
  867. # 1 城市
  868. data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  869. #身高#体重#收缩压#舒张压
  870. biomarkers["qi002"] = biomarkers["qi002"].apply(lambda x : np.nan if x >210 else x)
  871. biomarkers["ql002"] = biomarkers["ql002"].apply(lambda x : np.nan if x >150 else x)
  872. #腰围
  873. biomarkers['waist'] = biomarkers["qm002"].apply(lambda x : np.nan if x >210 else x)
  874. #血压测量后两次的平均
  875. biomarkers["qa007"] = biomarkers["qa007"].apply(lambda x : np.nan if x >300 else x)
  876. biomarkers["qa011"] = biomarkers["qa011"].apply(lambda x : np.nan if x >300 else x)
  877. biomarkers["qa008"] = biomarkers["qa008"].apply(lambda x : np.nan if x >150 else x)
  878. biomarkers["qa012"] = biomarkers["qa012"].apply(lambda x : np.nan if x >150 else x)
  879. biomarkers["Systolic"] = (biomarkers["qa007"] + biomarkers["qa011"]) /2
  880. biomarkers["Diastolic"] = (biomarkers["qa008"] + biomarkers["qa012"]) /2
  881. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  882. # 1 yes
  883. # 0 no
  884. biomarkers["Sit_Stand_5x"] = biomarkers["qh002"].apply(lambda x : 1 if x == 1 else 0 if x == 5 else np.nan)
  885. # 步行速度时间
  886. biomarkers["Walking_Speed_Time"] = (biomarkers["qg002"] + biomarkers["qg003"]) /2
  887. #身高#体重#收缩压#舒张压
  888. biomarkers_select = biomarkers[['ID','householdID', 'communityID','qi002', 'ql002', 'waist', 'Systolic','Diastolic', "Sit_Stand_5x", "Walking_Speed_Time"]]
  889. data_2015 = pd.merge(data_2015, biomarkers_select, on = ["ID", "householdID", "communityID"], how="left")
  890. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  891. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  892. blood = blood[['ID', 'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]
  893. data_2015 = pd.merge(data_2015, blood, on = ["ID"], how="left")
  894. # 慢性病:
  895. # (1) Hypertension 高血压病
  896. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  897. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  898. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  899. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  900. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  901. # (除脂肪肝、肿瘤或癌外)
  902. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  903. # (8) Stroke 中风
  904. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  905. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  906. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  907. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  908. # (13) Arthritis or rheumatism 关节炎或风湿病
  909. # (14) Asthma 哮喘
  910. # 体力活动
  911. # 2 vigorous (vigorous activity more than once a week)
  912. # 1 moderate (moderate activity more than once a week)
  913. # 0 inactive (the rest)
  914. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  915. 1 if x["da051_2_"]==1 else
  916. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  917. else np.nan ,axis=1)
  918. # 抽烟
  919. # 1 抽过烟
  920. # 0 没有抽过烟
  921. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  922. # 喝酒
  923. # 1 喝过酒
  924. # 0 没有喝过酒
  925. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  926. 0 if x["da069"] == 1 else
  927. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  928. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  929. # 1 是
  930. # 0 否
  931. health_status['Accident_Or_Injury']=health_status["da021"].apply(lambda x : 1 if x ==1 else
  932. 0 if x == 2 else np.nan)
  933. # 过去两年有没有摔倒?
  934. # 1 是
  935. # 0 否
  936. health_status['Fell_In_Last2Years']=health_status["da023"].apply(lambda x : 1 if x ==1 else
  937. 0 if x == 2 else np.nan)
  938. # # 您什么时候开始来月经的?(year/age)
  939. # health_status['Menarche_Year']=health_status["da026_1"]
  940. # health_status['Menarche_Age']=health_status["da026_2"]
  941. # # 您什么时候开始绝经的?
  942. # health_status['Menopause_Year']=health_status["da028_1"]
  943. # health_status['Menopause_Age']=health_status["da028_2"]
  944. # # 第一次诊断出您有前列腺疾病是在什么时候?
  945. # health_status['Prostate_Issue_Year']=health_status["da030_1"]
  946. # health_status['Prostate_Issue_Age']=health_status["da030_2"]
  947. # 是否戴眼镜(包括矫正视力镜片)?
  948. # 1 是
  949. # 0 否
  950. # 2 失明
  951. # 3 偶尔
  952. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan)
  953. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  954. health_status['Average_Sleep_Hours']=health_status["da049"]
  955. # 过去一个月内,您通常午睡多长时间?分钟
  956. health_status['Average_Nap_Minutes']=health_status["da050"]
  957. # 您通常每周有没有至少持续做激烈活动十分钟?
  958. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  959. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  960. health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  961. # 您通常每周有没有至少持续走路十分钟?
  962. health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  963. # 您通常每周有多少天做[激烈活动]至少十分钟?
  964. health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1)
  965. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  966. health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1)
  967. # 您通常每周有多少天做[走路]至少十分钟?
  968. health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1)
  969. # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  970. # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  971. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  972. # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  973. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  974. # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  975. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  976. # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  977. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  978. # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  979. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  980. # health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  981. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  982. # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  983. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  984. # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  985. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  986. # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  987. # 活动的原因
  988. # 1 工作需要
  989. # 2 娱乐
  990. # 3 体育锻炼
  991. # 4 其他
  992. # health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"]
  993. # health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"]
  994. # health_status["Reason_For_Walking"]= health_status["da051_1_3_"]
  995. # 过去一个月是否进行了下列社交活动?
  996. # (1) 串门、跟朋友交往
  997. # (2) 打麻将、下棋、打牌、去社区活动室
  998. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  999. # (4) 去公园或者其他场所跳舞、健身、练气功等
  1000. # (5) 参加社团组织活动
  1001. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  1002. # (7) 上学或者参加培训课程
  1003. # (8)其他
  1004. # (9) 以上均没有
  1005. health_status["da056s1"] = health_status.apply(lambda x: 1 if x["da056s1"]==1 else 0, axis=1)
  1006. health_status["da056s2"] = health_status.apply(lambda x: 1 if x["da056s2"]==2 else 0, axis=1)
  1007. health_status["da056s3"] = health_status.apply(lambda x: 1 if x["da056s3"]==3 else 0, axis=1)
  1008. health_status["da056s4"] = health_status.apply(lambda x: 1 if x["da056s4"]==4 else 0, axis=1)
  1009. health_status["da056s5"] = health_status.apply(lambda x: 1 if x["da056s5"]==5 else 0, axis=1)
  1010. health_status["da056s6"] = health_status.apply(lambda x: 1 if x["da056s6"]==6 or x["da056s7"]==7 else 0, axis=1)
  1011. health_status["da056s7"] = health_status.apply(lambda x: 1 if x["da056s8"]==8 else 0, axis=1)
  1012. health_status["da056s8"] = health_status.apply(lambda x: 1 if x["da056s9"]==9 or x["da056s10"]==10 or x["da056s11"]==11 else 0, axis=1)
  1013. health_status["da056s9"] = health_status.apply(lambda x: 1 if x["da056s12"]==12 else 0, axis=1)
  1014. # 过去一个月的活动频率
  1015. # (1) Almost daily 差不多每天
  1016. # (2) Almost every week 差不多每周
  1017. # (3) Not regularly 不经常
  1018. # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  1019. # health_status["da057_7_"] = health_status["da057_8_"]
  1020. # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  1021. # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  1022. # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  1023. # else np.nan, axis=1)
  1024. # 过去一个月,您是否上网?
  1025. health_status["Internet_Usage_LastMonth"] = health_status["da056s10"].apply(lambda x : 1 if x==10 else 0)
  1026. # # 使用以下哪些工具上网?
  1027. # # 1. Desktop computer 台式电脑
  1028. # # 2. Laptop computer 笔记本电脑
  1029. # # 3. Tablet computer 平板电脑(如 IPAD)
  1030. # # 4. Cellphone 手机
  1031. # # 5. Other devices 其他设备
  1032. # health_status["Internet_Tools_Desktop_computer"] = health_status["da056_w3s1"]
  1033. # health_status["Internet_Tools_Laptop_computer"] = health_status["da056_w3s2"]
  1034. # health_status["Internet_Tools_Tablet_computer"] = health_status["da056_w3s3"]
  1035. # health_status["Internet_Tools_Cellphone"] = health_status["da056_w3s4"]
  1036. # health_status["Internet_Tools_Other"] = health_status["da056_w3s5"]
  1037. # # 上网一般做什么?
  1038. # health_status[["Internet_Purpose_Chat"]] = np.nan
  1039. # health_status[["Internet_Purpose_news"]] = np.nan
  1040. # health_status[["Internet_Purpose_videos"]] = np.nan
  1041. # health_status[["Internet_Purpose_games"]] = np.nan
  1042. # health_status[["Internet_Purpose_Financial"]] = np.nan
  1043. # health_status[["Internet_Purpose_Others"]] = np.nan
  1044. # # 是否会用手机支付
  1045. # health_status[["Mobile_Payment"]] = np.nan
  1046. # # 是否使用微信?
  1047. # health_status[["Wechat_Usage"]] = np.nan
  1048. # # 发不发微信朋友圈?
  1049. # health_status[["Post_Moments"]] = np.nan
  1050. # # 现在还在吸烟还是戒烟了?
  1051. # # 1 仍然抽烟
  1052. # # 2 戒烟
  1053. # health_status['Current_Smoking_Status']=health_status["da061"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else np.nan)
  1054. # # 更新一下2013年没有回答的用户
  1055. # health_status["da061_w3"] = health_status["da061_w3"].apply(lambda x : 1 if x ==1 else 2 if x ==2 else np.nan)
  1056. # data_2013 = pd.merge(data_2013, health_status[['ID',"da061_w3"]], on='ID', how='left')
  1057. # data_2013['Current_Smoking_Status'] = data_2013['Current_Smoking_Status'].fillna(data_2013['da061_w3'])
  1058. # data_2013 = data_2013.drop('da061_w3', axis=1)
  1059. # # 按 'ID' 列合并两个表
  1060. # # 吸烟时,一般抽什么烟?
  1061. # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  1062. # # (2) Smoking self-rolled cigarettes 自己卷烟抽
  1063. # # (3) Filtered cigarette带滤咀香烟
  1064. # # (4) Unfiltered cigarette不带滤咀香烟
  1065. # # (5) Cigar雪茄
  1066. # # (6) Water cigarettes 水烟
  1067. # health_status['Smoking_Type_pipe']=health_status["da060s1"]
  1068. # health_status['Smoking_Type_rolled']=health_status["da060s2"]
  1069. # health_status['Smoking_Type_Filtered']=health_status["da060s3"]
  1070. # health_status['Smoking_Type_Unfiltered']=health_status["da060s4"]
  1071. # health_status['Smoking_Type_Cigar']=health_status["da060s5"]
  1072. # health_status['Smoking_Type_Water']=health_status["da060s6"]
  1073. # # 现在/戒烟前平均一天抽多少支香烟?
  1074. # health_status['Daily_Cigarette_Count']=health_status["da063"]
  1075. # 在过去的一年, 喝酒吗
  1076. # (1) Drink more than once a month. 喝酒,每月超过一次
  1077. # (2) Drink but less than once a month 喝酒,但每月少于一次
  1078. # (3) None of these 什么都不喝
  1079. health_status['Drink_PastYear']=health_status["da067"]
  1080. # 过去一年内 平均一个月喝几次酒
  1081. # (1)Once a month 每月一次
  1082. # (2)2-3 times a month 每月2-3次
  1083. # (3)Once a week 每周一次
  1084. # (4)2-3 times a week 每周2-3次
  1085. # (5)4-6 times a week 每周4-6次
  1086. # (6)Once a day 每天一次
  1087. # (7)Twice a day 一天两次
  1088. # (8)More than twice a day 一天超过两次
  1089. # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  1090. # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  1091. # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  1092. # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  1093. # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  1094. # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  1095. # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  1096. # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  1097. # 自上次访问以来的两年内,您是否发作过心脏病?
  1098. # 1 是
  1099. # 0 否
  1100. health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else
  1101. 0 if x["da007_w2_5"] == 2 else np.nan, axis=1)
  1102. # 自上次访问以来,是否有医生诊断您中风复发?
  1103. # 1 是
  1104. # 0 否
  1105. health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else
  1106. 0 if x["da019_w2_1"] == 2 else np.nan, axis=1)
  1107. # 合并2013年的慢性病
  1108. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  1109. ,'da007_12_','da007_13_','da007_14_']
  1110. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1111. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1112. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  1113. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  1114. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  1115. diseases_2013 = data_2013[data_2013["wave"]=="2013"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1116. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1117. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  1118. # 按 'ID' 列合并两个表
  1119. health_status = pd.merge(health_status, diseases_2013, on='ID', how='left', suffixes=("_2015","_2013"))
  1120. # 使用 fillna() 来更新字段
  1121. for col in columns_to_diseases_new:
  1122. health_status[col] = health_status[f'{col}_2015'].fillna(health_status[f'{col}_2013'])
  1123. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1124. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1125. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" ,
  1126. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  1127. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days"
  1128. , "da056s1", "da056s2", "da056s3", "da056s4", "da056s5", "da056s6", "da056s7", "da056s8", "da056s9"
  1129. , "Internet_Usage_LastMonth", "Drink_PastYear"]]
  1130. data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1131. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  1132. health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1133. health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  1134. health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  1135. health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1136. # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1137. health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0)
  1138. health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0)
  1139. health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
  1140. health_status["dc022_score"] = health_status["dc022"].apply(lambda x : 1 if x==72 else 0 if pd.isna(x) else 0)
  1141. health_status["dc023_score"] = health_status["dc023"].apply(lambda x : 1 if x==65 else 0 if pd.isna(x) else 0)
  1142. #词语记忆
  1143. health_status["dc006s1_score"] = health_status["dc006s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1144. health_status["dc006s2_score"] = health_status["dc006s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  1145. health_status["dc006s3_score"] = health_status["dc006s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  1146. health_status["dc006s4_score"] = health_status["dc006s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  1147. health_status["dc006s5_score"] = health_status["dc006s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  1148. health_status["dc006s6_score"] = health_status["dc006s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  1149. health_status["dc006s7_score"] = health_status["dc006s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  1150. health_status["dc006s8_score"] = health_status["dc006s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  1151. health_status["dc006s9_score"] = health_status["dc006s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  1152. health_status["dc006s10_score"] = health_status["dc006s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  1153. # health_status["dc006s11_score"] = health_status["dc006s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1154. health_status["dc027s1_score"] = health_status["dc027s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
  1155. health_status["dc027s2_score"] = health_status["dc027s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
  1156. health_status["dc027s3_score"] = health_status["dc027s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
  1157. health_status["dc027s4_score"] = health_status["dc027s4"].apply(lambda x : 1 if x==4 else 0 if pd.isna(x) else 0)
  1158. health_status["dc027s5_score"] = health_status["dc027s5"].apply(lambda x : 1 if x==5 else 0 if pd.isna(x) else 0)
  1159. health_status["dc027s6_score"] = health_status["dc027s6"].apply(lambda x : 1 if x==6 else 0 if pd.isna(x) else 0)
  1160. health_status["dc027s7_score"] = health_status["dc027s7"].apply(lambda x : 1 if x==7 else 0 if pd.isna(x) else 0)
  1161. health_status["dc027s8_score"] = health_status["dc027s8"].apply(lambda x : 1 if x==8 else 0 if pd.isna(x) else 0)
  1162. health_status["dc027s9_score"] = health_status["dc027s9"].apply(lambda x : 1 if x==9 else 0 if pd.isna(x) else 0)
  1163. health_status["dc027s10_score"] = health_status["dc027s10"].apply(lambda x : 1 if x==10 else 0 if pd.isna(x) else 0)
  1164. # health_status["dc027s11_score"] = health_status["dc027s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1165. #画图
  1166. health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1167. data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  1168. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  1169. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  1170. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  1171. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  1172. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  1173. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  1174. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  1175. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  1176. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  1177. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  1178. health_status["draw_score"]
  1179. #心理得分
  1180. health_status["dc009_score"] = health_status["dc009"]-1
  1181. health_status["dc010_score"] = health_status["dc010"]-1
  1182. health_status["dc011_score"] = health_status["dc011"]-1
  1183. health_status["dc012_score"] = health_status["dc012"]-1
  1184. health_status["dc013_score"] = 4 - health_status["dc013"]
  1185. health_status["dc014_score"] = health_status["dc014"]-1
  1186. health_status["dc015_score"] = health_status["dc015"]-1
  1187. health_status["dc016_score"] = 4 - health_status["dc016"]
  1188. health_status["dc017_score"] = health_status["dc017"]-1
  1189. health_status["dc018_score"] = health_status["dc018"]-1
  1190. data_2015["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  1191. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  1192. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  1193. #睡眠状态
  1194. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  1195. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  1196. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  1197. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  1198. data_2015["sleep_state"] = health_status['dc015']
  1199. #ADL
  1200. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1201. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1202. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1203. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1204. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1205. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1206. data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  1207. health_status["db014_score"] + health_status["db015_score"]
  1208. # 是否有管道煤气或天然气?
  1209. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1210. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  1211. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1212. # 供暖所用的主要能源是什么?
  1213. # (1)Solar 太阳能
  1214. # (2)Coal 煤炭、蜂窝煤
  1215. # (3)Natural gas 管道天然气或煤气
  1216. # (4)Liquefied Petroleum Gas 液化石油气
  1217. # (5)Electric 电
  1218. # (6)Crop residue/Wood buring 秸秆、柴火
  1219. # (7)Other 其他
  1220. houseing["Heating_Energy"] = houseing["i021"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan)
  1221. # 做饭用的主要燃料是什么?
  1222. # (1)Coal 煤炭、蜂窝煤
  1223. # (2)Natural gas 管道天然气或煤气
  1224. # (3)Marsh gas 沼气
  1225. # (4)Liquefied Petroleum Gas 液化石油气
  1226. # (5)Electric 电
  1227. # (6)crop residue/Wood burning 秸秆、柴火
  1228. # (7)other 其他
  1229. houseing["Cooking_Fuel"] = houseing["i022"].apply(lambda x : 0 if x ==7 else x if not pd.isna(x) else np.nan)
  1230. houseing_select = houseing[['ID','householdID', 'communityID','Gas_Connection',
  1231. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  1232. data_2015 = pd.merge(data_2015, houseing_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1233. data_2015["wave"] = year
  1234. change_columns(data_2015)
  1235. data_2015 = pd.concat([data_2013, data_2015], axis=0)
  1236. print("2015 complete")
  1237. # 2018年
  1238. year = "2018"
  1239. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  1240. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  1241. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  1242. health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
  1243. cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
  1244. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  1245. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Housing.dta")
  1246. #性别#年龄#婚姻状况
  1247. # 1 married or partnered
  1248. # 0 other marital status (separated, divorced, unmarried, or widowed)
  1249. demo["marital_status"] = demo.apply(lambda x : 1 if x["be001"]==1 or x["be001"]==2 or x["be002"]==1 else 0 if x["be001"] in [3,4,5,6] else np.nan, axis=1)
  1250. #教育
  1251. # 0 below high school
  1252. # 1 high school
  1253. # 2 college or above
  1254. #更新2015的教育
  1255. demo["education"] = demo.apply(lambda x : x["bd001_w2_4"] if not pd.isna(x["bd001_w2_4"]) else np.nan, axis=1)
  1256. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  1257. # 出生年
  1258. demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
  1259. demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
  1260. #获取随访时间
  1261. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  1262. data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  1263. #居住地
  1264. # 0 农村
  1265. # 1 城市
  1266. data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  1267. #身高#体重#腰围#收缩压#舒张压
  1268. data_2018[['qi002', 'ql002', 'waist','qa011' ,'qa012']]=np.nan
  1269. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  1270. data_2018[["Sit_Stand_5x", "Walking_Speed_Time"]] = np.nan
  1271. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  1272. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  1273. data_2018[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  1274. # 慢性病:
  1275. # (1) Hypertension 高血压病
  1276. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  1277. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  1278. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  1279. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  1280. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  1281. # (除脂肪肝、肿瘤或癌外)
  1282. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  1283. # (8) Stroke 中风
  1284. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  1285. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  1286. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  1287. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  1288. # (13) Arthritis or rheumatism 关节炎或风湿病
  1289. # (14) Asthma 哮喘
  1290. # 体力活动
  1291. # 2 vigorous (vigorous activity more than once a week)
  1292. # 1 moderate (moderate activity more than once a week)
  1293. # 0 inactive (the rest)
  1294. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da051_1_"]==1 else
  1295. 1 if x["da051_2_"]==1 else
  1296. 0 if x["da051_3_"] == 1 or (x["da051_1_"]==2 and x["da051_2_"]==2 and x["da051_3_"] == 2)
  1297. else np.nan ,axis=1)
  1298. # 抽烟
  1299. # 1 抽过烟
  1300. # 0 没有抽过烟
  1301. health_status["Smoke"] = health_status["da059"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  1302. # 喝酒
  1303. # 1 喝过酒
  1304. # 0 没有喝过酒
  1305. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da067"] ==1 or x["da067"] ==2 else
  1306. 0 if x["da069"] == 1 else
  1307. 1 if x["da069"] == 2 or x["da069"] == 3 else np.nan, axis=1)
  1308. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  1309. # 1 是
  1310. # 0 否
  1311. health_status['Accident_Or_Injury']=health_status.apply(lambda x : 1 if (not pd.isna(x["da021"]) and x["da021"]==1) or (pd.isna(x["da021"]) and not pd.isna(x["da022"]) )else
  1312. 0 if (not pd.isna(x["da021"]) and x["da021"]==2) or (pd.isna(x["da021"]) and pd.isna(x["da022"]) ) else np.nan, axis=1)
  1313. # 过去两年有没有摔倒?
  1314. # 1 是
  1315. # 0 否
  1316. health_status['Fell_In_Last2Years']=health_status.apply(lambda x : 1 if x["da023"] ==1 or x["da023_w4"]==1 else
  1317. 0 if x["da023"] ==2 or x["da023_w4"]==2 else np.nan, axis=1)
  1318. # # 您什么时候开始来月经的?(year/age)
  1319. # health_status['Menarche_Year']=health_status["da026_1"]
  1320. # health_status['Menarche_Age']=health_status["da026_2"]
  1321. # # 您什么时候开始绝经的?
  1322. # health_status['Menopause_Year']=health_status["da028_1"]
  1323. # health_status['Menopause_Age']=health_status["da028_2"]
  1324. # # 第一次诊断出您有前列腺疾病是在什么时候?
  1325. # health_status['Prostate_Issue_Year']=health_status["da030_1"]
  1326. # health_status['Prostate_Issue_Age']=health_status["da030_2"]
  1327. # 是否戴眼镜(包括矫正视力镜片)?
  1328. # 1 是
  1329. # 0 否
  1330. # 2 失明
  1331. # 3 偶尔
  1332. health_status['Wear_Glasses']=health_status["da032"].apply(lambda x : 1 if x == 1 else 2 if x ==2 else 0 if x == 3 else 3 if x == 4 else np.nan)
  1333. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  1334. health_status['Average_Sleep_Hours']=health_status["da049"]
  1335. # 过去一个月内,您通常午睡多长时间?分钟
  1336. health_status['Average_Nap_Minutes']=health_status["da050"]
  1337. # 您通常每周有没有至少持续做激烈活动十分钟?
  1338. health_status['Vigorous_Activity_10Min']=health_status["da051_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  1339. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  1340. health_status['Moderate_Effort_10Min']=health_status["da051_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  1341. # 您通常每周有没有至少持续走路十分钟?
  1342. health_status['Walking_10Min']=health_status["da051_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  1343. # 您通常每周有多少天做[激烈活动]至少十分钟?
  1344. health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da052_1_"]) else x["da052_1_"], axis=1)
  1345. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  1346. health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da052_2_"]) else x["da052_2_"], axis=1)
  1347. # 您通常每周有多少天做[走路]至少十分钟?
  1348. health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da052_3_"]) else x["da052_3_"], axis=1)
  1349. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  1350. # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da053_1_"]
  1351. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  1352. # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da053_2_"]
  1353. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  1354. # health_status['Walking_2Hours_PerDay']=health_status["da053_3_"]
  1355. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  1356. # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da054_1_"]
  1357. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  1358. # health_status['Moderate_Effort_30Min_PerDay']=health_status["da054_2_"]
  1359. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  1360. # health_status['Walking_30Min_PerDay']=health_status["da054_3_"]
  1361. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  1362. # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da055_1_"]
  1363. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  1364. # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da055_2_"]
  1365. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  1366. # health_status['Walking_4Hours_PerDay']=health_status["da055_3_"]
  1367. # 活动的原因
  1368. # 1 工作需要
  1369. # 2 娱乐
  1370. # 3 体育锻炼
  1371. # 4 其他
  1372. # health_status["Reason_For_Vigorous_Activity"]= health_status["da051_1_1_"]
  1373. # health_status["Reason_For_Moderate_Effort"]= health_status["da051_1_2_"]
  1374. # health_status["Reason_For_Walking"]= health_status["da051_1_3_"]
  1375. # 过去一个月是否进行了下列社交活动?
  1376. # (1) 串门、跟朋友交往
  1377. # (2) 打麻将、下棋、打牌、去社区活动室
  1378. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  1379. # (4) 去公园或者其他场所跳舞、健身、练气功等
  1380. # (5) 参加社团组织活动
  1381. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  1382. # (7) 上学或者参加培训课程
  1383. # (8)其他
  1384. # (9) 以上均没有
  1385. health_status["da056_s1"] = health_status.apply(lambda x: 1 if x["da056_s1"]==1 else 0, axis=1)
  1386. health_status["da056_s2"] = health_status.apply(lambda x: 1 if x["da056_s2"]==2 else 0, axis=1)
  1387. health_status["da056_s3"] = health_status.apply(lambda x: 1 if x["da056_s3"]==3 else 0, axis=1)
  1388. health_status["da056_s4"] = health_status.apply(lambda x: 1 if x["da056_s4"]==4 else 0, axis=1)
  1389. health_status["da056_s5"] = health_status.apply(lambda x: 1 if x["da056_s5"]==5 else 0, axis=1)
  1390. health_status["da056_s6"] = health_status.apply(lambda x: 1 if x["da056_s6"]==6 or x["da056_s7"]==7 else 0, axis=1)
  1391. health_status["da056_s7"] = health_status.apply(lambda x: 1 if x["da056_s8"]==8 else 0, axis=1)
  1392. health_status["da056_s8"] = health_status.apply(lambda x: 1 if x["da056_s9"]==9 or x["da056_s10"]==10 or x["da056_s11"]==11 else 0, axis=1)
  1393. health_status["da056_s9"] = health_status.apply(lambda x: 1 if x["da056_s12"]==12 else 0, axis=1)
  1394. # 过去一个月的活动频率
  1395. # (1) Almost daily 差不多每天
  1396. # (2) Almost every week 差不多每周
  1397. # (3) Not regularly 不经常
  1398. # health_status["da057_6_"] = health_status.apply(lambda x: 1 if x["da057_6_"]==1 or x["da057_7_"]==1 else 2 if x["da057_6_"]==2 or x["da057_7_"]==2 else 3 if x["da057_6_"]==3 or x["da057_7_"]==3 else np.nan, axis=1)
  1399. # health_status["da057_7_"] = health_status["da057_8_"]
  1400. # health_status["da057_8_"] = health_status.apply(lambda x: 1 if x["da057_9_"]==1 or x["da057_10_"]==1 or x["da057_11_"]==1
  1401. # else 2 if x["da057_9_"]==2 or x["da057_10_"]==2 or x["da057_11_"]==2
  1402. # else 3 if x["da057_9_"]==3 or x["da057_10_"]==3 or x["da057_11_"]==3
  1403. # else np.nan, axis=1)
  1404. # 过去一个月,您是否上网?
  1405. health_status["Internet_Usage_LastMonth"] = health_status["da056_s10"].apply(lambda x : 1 if x==10 else 0)
  1406. # 使用以下哪些工具上网?
  1407. # 1. Desktop computer 台式电脑
  1408. # 2. Laptop computer 笔记本电脑
  1409. # 3. Tablet computer 平板电脑(如 IPAD)
  1410. # 4. Cellphone 手机
  1411. # # 5. Other devices 其他设备
  1412. # health_status["Internet_Tools_Desktop_computer"] = health_status["da056_w3_s1"]
  1413. # health_status["Internet_Tools_Laptop_computer"] = health_status["da056_w3_s2"]
  1414. # health_status["Internet_Tools_Tablet_computer"] = health_status["da056_w3_s3"]
  1415. # health_status["Internet_Tools_Cellphone"] = health_status["da056_w3_s4"]
  1416. # health_status["Internet_Tools_Other"] = health_status["da056_w3_s5"]
  1417. # # 上网一般做什么?
  1418. # health_status["Internet_Purpose_Chat"] = health_status["da056_w4_1_s1"]
  1419. # health_status["Internet_Purpose_news"] = health_status["da056_w4_1_s2"]
  1420. # health_status["Internet_Purpose_videos"] = health_status["da056_w4_1_s3"]
  1421. # health_status["Internet_Purpose_games"] = health_status["da056_w4_1_s4"]
  1422. # health_status["Internet_Purpose_Financial"] = health_status["da056_w4_1_s5"]
  1423. # health_status["Internet_Purpose_Others"] = health_status["da056_w4_1_s6"]
  1424. # # 是否会用手机支付
  1425. # # 1 是
  1426. # # 0 否
  1427. # health_status["Mobile_Payment"] = health_status["da056_w4_2"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1428. # # 是否使用微信?
  1429. # health_status["Wechat_Usage"] = health_status["da056_w4_3"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1430. # # 发不发微信朋友圈?
  1431. # health_status["Post_Moments"] = health_status["da056_w4_4"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1432. # # 现在还在吸烟还是戒烟了?
  1433. # # 1 仍然抽烟
  1434. # # 2 戒烟
  1435. # health_status['Current_Smoking_Status']=health_status.apply(lambda x : 1 if x["da061"] == 1 or x["da061_w4"] == 1 else 2 if x["da061"] == 2 or x["da061_w4"] == 2 else np.nan, axis=1)
  1436. # # 吸烟时,一般抽什么烟?
  1437. # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  1438. # # (2) Smoking self-rolled cigarettes 自己卷烟抽
  1439. # # (3) Filtered cigarette带滤咀香烟
  1440. # # (4) Unfiltered cigarette不带滤咀香烟
  1441. # # (5) Cigar雪茄
  1442. # # (6) Water cigarettes 水烟
  1443. # health_status.loc[health_status['da060'] == 1, 'Smoking_Type_pipe'] = 1
  1444. # health_status.loc[health_status['da060'] == 2, 'Smoking_Type_rolled'] = 2
  1445. # health_status.loc[health_status['da060'] == 3, 'Smoking_Type_Filtered'] = 3
  1446. # health_status.loc[health_status['da060'] == 4, 'Smoking_Type_Unfiltered'] = 4
  1447. # health_status.loc[health_status['da060'] == 5, 'Smoking_Type_Cigar'] = 5
  1448. # health_status.loc[health_status['da060'] == 6, 'Smoking_Type_Water'] = 6
  1449. # # 现在/戒烟前平均一天抽多少支香烟?
  1450. # health_status['Daily_Cigarette_Count']=health_status["da063"]
  1451. # 在过去的一年, 喝酒吗
  1452. # (1) Drink more than once a month. 喝酒,每月超过一次
  1453. # (2) Drink but less than once a month 喝酒,但每月少于一次
  1454. # (3) None of these 什么都不喝
  1455. health_status['Drink_PastYear']=health_status["da067"]
  1456. # 过去一年内 平均一个月喝几次酒
  1457. # (1)Once a month 每月一次
  1458. # (2)2-3 times a month 每月2-3次
  1459. # (3)Once a week 每周一次
  1460. # (4)2-3 times a week 每周2-3次
  1461. # (5)4-6 times a week 每周4-6次
  1462. # (6)Once a day 每天一次
  1463. # (7)Twice a day 一天两次
  1464. # (8)More than twice a day 一天超过两次
  1465. # health_status['Drink_Monthly_Frequency']=health_status.apply(lambda x : 8 if x["da072"] ==8 or x["da074"] ==8 or x["da076"] ==8 else
  1466. # 7 if x["da072"] ==7 or x["da074"] ==7 or x["da076"] ==7 else
  1467. # 6 if x["da072"] ==6 or x["da074"] ==6 or x["da076"] ==6 else
  1468. # 5 if x["da072"] ==5 or x["da074"] ==5 or x["da076"] ==5 else
  1469. # 4 if x["da072"] ==4 or x["da074"] ==4 or x["da076"] ==4 else
  1470. # 3 if x["da072"] ==3 or x["da074"] ==3 or x["da076"] ==3 else
  1471. # 2 if x["da072"] ==2 or x["da074"] ==2 or x["da076"] ==2 else
  1472. # 1 if x["da072"] ==1 or x["da074"] ==1 or x["da076"] ==1 else np.nan, axis=1)
  1473. # 自上次访问以来的两年内,您是否发作过心脏病?
  1474. # 1 是
  1475. # 0 否
  1476. health_status["Heart_attack_2_years"] = health_status.apply(lambda x : 1 if x["da007_w2_5"] ==1 else
  1477. 0 if x["da007_w2_5"] == 2 else np.nan, axis=1)
  1478. # 自上次访问以来,是否有医生诊断您中风复发?
  1479. # 1 是
  1480. # 0 否
  1481. health_status['Recurrent_Stroke']=health_status.apply(lambda x : 1 if x["da019_w2_1"] ==1 else
  1482. 0 if x["da019_w2_1"] == 2 else np.nan, axis=1)
  1483. columns_to_diseases_old = ['da007_1_', 'da007_2_','da007_3_','da007_4_','da007_5_','da007_6_','da007_7_','da007_8_','da007_9_','da007_10_','da007_11_'
  1484. ,'da007_12_','da007_13_','da007_14_']
  1485. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1486. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1487. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  1488. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  1489. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  1490. diseases_2015 = data_2015[data_2015["wave"]=="2015"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1491. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1492. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  1493. # 按 'ID' 列合并两个表
  1494. health_status = pd.merge(health_status, diseases_2015, on='ID', how='left', suffixes=("_2018","_2015"))
  1495. # 使用 fillna() 来更新字段
  1496. for col in columns_to_diseases_new:
  1497. health_status[col] = health_status[f'{col}_2018'].fillna(health_status[f'{col}_2015'])
  1498. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1499. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1500. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses" ,
  1501. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  1502. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days"
  1503. , "da056_s1", "da056_s2", "da056_s3", "da056_s4", "da056_s5", "da056_s6", "da056_s7", "da056_s8", "da056_s9"
  1504. , "Internet_Usage_LastMonth", "Drink_PastYear"]]
  1505. data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1506. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  1507. cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1508. cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1509. cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1510. cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1511. # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1512. cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1)
  1513. cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1)
  1514. cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
  1515. cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
  1516. cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
  1517. #词语记忆
  1518. cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
  1519. cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
  1520. cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
  1521. cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1)
  1522. cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1)
  1523. cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)
  1524. cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1)
  1525. cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1)
  1526. cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)
  1527. cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)
  1528. # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1529. cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1)
  1530. cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1)
  1531. cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1)
  1532. cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1)
  1533. cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1)
  1534. cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)
  1535. cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1)
  1536. cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1)
  1537. cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)
  1538. cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)
  1539. # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  1540. #画图
  1541. cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  1542. data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
  1543. cognition["dc001s3_score"] + cognition["dc002_score"]+ \
  1544. cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
  1545. cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
  1546. cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
  1547. cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
  1548. cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
  1549. cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
  1550. cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
  1551. cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
  1552. cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
  1553. cognition["draw_score"]
  1554. #心理得分
  1555. cognition["dc009_score"] = cognition["dc009"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1556. cognition["dc010_score"] = cognition["dc010"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1557. cognition["dc011_score"] = cognition["dc011"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1558. cognition["dc012_score"] = cognition["dc012"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1559. cognition["dc013_score"] = cognition["dc013"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1560. cognition["dc014_score"] = cognition["dc014"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1561. cognition["dc015_score"] = cognition["dc015"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1562. cognition["dc016_score"] = cognition["dc016"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1563. cognition["dc017_score"] = cognition["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1564. cognition["dc018_score"] = cognition["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1565. data_2018["psychiatric_score"] = cognition["dc009_score"] + cognition["dc010_score"] + cognition["dc011_score"] + \
  1566. cognition["dc012_score"] + cognition["dc013_score"] + cognition["dc014_score"] + cognition["dc015_score"] + \
  1567. cognition["dc016_score"] + cognition["dc017_score"] + cognition["dc018_score"]
  1568. #睡眠状态
  1569. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  1570. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  1571. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  1572. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  1573. data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x)
  1574. #ADL
  1575. health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1576. health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1577. health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1578. health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1579. health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1580. health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1581. data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  1582. health_status["db014_score"] + health_status["db015_score"]
  1583. # 是否有管道煤气或天然气?
  1584. houseing["Gas_Connection"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1585. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  1586. houseing["Heating_Facility"] = houseing["i020"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1587. # 供暖所用的主要能源是什么?
  1588. # (1)Solar 太阳能
  1589. # (2)Coal 煤炭、蜂窝煤
  1590. # (3)Natural gas 管道天然气或煤气
  1591. # (4)Liquefied Petroleum Gas 液化石油气
  1592. # (5)Electric 电
  1593. # (6)Crop residue/Wood buring 秸秆、柴火
  1594. # (7)Other 其他
  1595. houseing["Heating_Energy"] = houseing["i021_w4"].apply(lambda x : 0 if x==8 or x==7 else x )
  1596. # 做饭用的主要燃料是什么?
  1597. # (1)Coal 煤炭、蜂窝煤
  1598. # (2)Natural gas 管道天然气或煤气
  1599. # (3)Marsh gas 沼气
  1600. # (4)Liquefied Petroleum Gas 液化石油气
  1601. # (5)Electric 电
  1602. # (6)crop residue/Wood burning 秸秆、柴火
  1603. # (7)other 其他
  1604. houseing["Cooking_Fuel"] = houseing["i022_w4"].apply(lambda x : np.nan if x==8 else 0 if x==7 else x )
  1605. houseing_select = houseing[['householdID', 'communityID','Gas_Connection',
  1606. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  1607. data_2018 = pd.merge(data_2018, houseing_select, on = ['householdID', 'communityID'], how="left")
  1608. data_2018["wave"] = year
  1609. change_columns(data_2018)
  1610. data_2018 = pd.concat([data_2015, data_2018], axis=0)
  1611. print("2018 complete")
  1612. # 2020年
  1613. year = "2020"
  1614. demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
  1615. psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
  1616. health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
  1617. weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
  1618. houseing, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Household_Income.dta")
  1619. #性别#年龄#婚姻状况
  1620. # 1 married or partnered
  1621. # 0 other marital status (separated, divorced, unmarried, or widowed)
  1622. demo["marital_status"] = demo.apply(lambda x : 1 if x["ba011"]==1 or x["ba011"]==2 or x["ba012"]==1 else 0 if x["ba011"] in [3,4,5,6] else np.nan, axis=1)
  1623. #教育
  1624. # 0 below high school
  1625. # 1 high school
  1626. # 2 college or above
  1627. demo["education"] = demo.apply(lambda x : x["ba010"] if not pd.isna(x["ba010"]) else np.nan, axis=1)
  1628. demo["education"] = demo["education"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
  1629. #合并2018年的教育
  1630. eductaion_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"education"]]
  1631. # 按 'ID' 列合并两个表
  1632. demo = pd.merge(demo, eductaion_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  1633. # 使用 fillna() 来更新字段
  1634. demo['education'] = demo['education_2020'].fillna(demo['education_2018'])
  1635. # 出生年
  1636. demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
  1637. demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
  1638. #合并2018年的出生年
  1639. birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
  1640. # 按 'ID' 列合并两个表
  1641. demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  1642. # 使用 fillna() 来更新字段
  1643. demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
  1644. demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
  1645. #获取随访时间
  1646. demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
  1647. demo["ba003"] = 1
  1648. data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
  1649. #居住地
  1650. # 0 农村
  1651. # 1 城市
  1652. data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city', 'urban_nbs']], on = "communityID", how="left")
  1653. #身高#体重#收缩压#舒张压
  1654. data_2020[['qi002', 'ql002', 'waist', 'Systolic','Diastolic']]=np.nan
  1655. #受试者可以在不用手臂支撑的情况下按其平时的节奏连续起立坐下五次吗
  1656. data_2020[["Sit_Stand_5x", "Walking_Speed_Time"]] = np.nan
  1657. #白细胞(WBC),平均红血球容积MCV,血小板,血尿素氮bun,葡萄糖glu,血肌酐crea,总胆固醇cho,甘油三酯tg,高密度脂蛋白HDL,低密度脂蛋白胆固醇LDL,C反应蛋白CRP
  1658. #糖化血红蛋白hba1c,尿酸ua,血细胞比容Hematocrit,血红蛋白hgb,胱抑素C
  1659. data_2020[['bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp','bl_hbalc','bl_ua', 'bl_hct', 'bl_hgb','bl_cysc']]=np.nan
  1660. # 慢性病:
  1661. # (1) Hypertension 高血压病
  1662. # (2) Dyslipidemia (elevation of low density lipoprotein, triglycerides (TGs),and total cholesterol, or a low high density lipoprotein level)血脂异常(包括低密度脂蛋白、甘油三酯、总胆固醇的升高或(和)高密度脂蛋白的下降)
  1663. # (3) Diabetes or high blood sugar糖尿病或血糖升高(包括糖耐量异常和空腹血糖升高)
  1664. # (4) Cancer or malignant tumor (excluding minor skin cancers) 癌症等恶性肿瘤(不包括轻度皮肤癌)
  1665. # (5) Chronic lung diseases, such as chronic bronchitis , emphysema ( excluding tumors, or cancer) 慢性肺部疾患如慢性支气管炎或肺气肿、肺心病(不包括肿瘤或癌)
  1666. # (6) Liver disease (except fatty liver, tumors, and cancer) 肝脏疾病
  1667. # (除脂肪肝、肿瘤或癌外)
  1668. # (7) Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems 心脏病(如心肌梗塞、冠心病、心绞痛、充血性心力衰竭和其他心脏疾病)
  1669. # (8) Stroke 中风
  1670. # (9) Kidney disease (except for tumor or cancer) 肾脏疾病(不包括肿瘤或癌)
  1671. # (10) Stomach or other digestive disease (except for tumor or cancer) 胃部疾病或消化系统疾病(不包括肿瘤或癌)
  1672. # (11) Emotional, nervous, or psychiatric problems 情感及精神方面问题
  1673. # (12) Memory-related disease 与记忆相关的疾病 (如老年痴呆症、脑萎缩、帕金森症)
  1674. # (13) Arthritis or rheumatism 关节炎或风湿病
  1675. # (14) Asthma 哮喘
  1676. # 2020年把帕金森和记忆病症分开,需要和以前对齐
  1677. # 体力活动
  1678. # 2 vigorous (vigorous activity more than once a week)
  1679. # 1 moderate (moderate activity more than once a week)
  1680. # 0 inactive (the rest)
  1681. health_status["Physical_activity"] = health_status.apply(lambda x : 2 if x["da032_1_"]==1 else
  1682. 1 if x["da032_2_"]==1 else
  1683. 0 if x["da032_3_"] == 1 or (x["da032_1_"]==2 and x["da032_2_"]==2 and x["da032_3_"] == 2)
  1684. else np.nan ,axis=1)
  1685. # 抽烟
  1686. # 1 抽过烟
  1687. # 0 没有抽过烟
  1688. health_status["Smoke"] = health_status["da046"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else 1)
  1689. # 喝酒
  1690. # 1 喝过酒
  1691. # 0 没有喝过酒
  1692. health_status["Drink"] = health_status.apply(lambda x : 1 if x["da051"] ==1 or x["da051"] ==2 else
  1693. 0 if x["da051"] == 3 else np.nan, axis=1)
  1694. # 您是否经历过交通事故,或任何的重大意外伤害,并接受了治疗?
  1695. # 1 是
  1696. # 0 否
  1697. health_status['Accident_Or_Injury']=health_status.apply(lambda x : 1 if x["da019"] ==1 or x["da020"]==1 else
  1698. 0 if x["da019"] ==2 or x["da020"]==2 else np.nan, axis=1)
  1699. # 过去两年有没有摔倒?
  1700. # 1 是
  1701. # 0 否
  1702. health_status['Fell_In_Last2Years']=health_status.apply(lambda x : 1 if x["da022"] ==1 or x["da023"]==1 else
  1703. 0 if x["da022"] ==2 or x["da023"]==2 else np.nan, axis=1)
  1704. # # 您什么时候开始来月经的?(year/age)
  1705. # health_status[['Menarche_Year']]=np.nan
  1706. # health_status[['Menarche_Age']]=np.nan
  1707. # # 您什么时候开始绝经的?
  1708. # health_status[['Menopause_Year']]=np.nan
  1709. # health_status[['Menopause_Age']]=np.nan
  1710. # # 第一次诊断出您有前列腺疾病是在什么时候?
  1711. # health_status[['Prostate_Issue_Year']]=np.nan
  1712. # health_status[['Prostate_Issue_Age']]=np.nan
  1713. # 是否戴眼镜(包括矫正视力镜片)?
  1714. # 1 是
  1715. # 0 否
  1716. # 2 失明
  1717. # 3 偶尔
  1718. health_status[['Wear_Glasses']]=np.nan
  1719. # 过去一个月内,您平均每天晚上真正睡着的时间大约是几小时?(可能短于您在床上躺着的时间)
  1720. health_status['Average_Sleep_Hours']=health_status["da030"]
  1721. # 过去一个月内,您通常午睡多长时间?分钟
  1722. health_status['Average_Nap_Minutes']=health_status["da031"]
  1723. # 您通常每周有没有至少持续做激烈活动十分钟?
  1724. health_status['Vigorous_Activity_10Min']=health_status["da032_1_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  1725. # 您通常每周有没有至少持续做中等强度的体力活动十分钟?
  1726. health_status['Moderate_Effort_10Min']=health_status["da032_2_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  1727. # 您通常每周有没有至少持续走路十分钟?
  1728. health_status['Walking_10Min']=health_status["da032_3_"].apply(lambda x : 1 if x == 1 else 0 if x ==2 else np.nan)
  1729. # 您通常每周有多少天做[激烈活动]至少十分钟?
  1730. health_status['Vigorous_Activity_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Vigorous_Activity_10Min"]) else 0 if pd.isna(x["da033_1_"]) else x["da033_1_"], axis=1)
  1731. # 您通常每周有多少天做[中等强度的体力活动]至少十分钟?
  1732. health_status['Moderate_Effort_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Moderate_Effort_10Min"]) else 0 if pd.isna(x["da033_2_"]) else x["da033_2_"], axis=1)
  1733. # 您通常每周有多少天做[走路]至少十分钟?
  1734. health_status['Walking_Days']=health_status.apply(lambda x : np.nan if pd.isna(x["Walking_10Min"]) else 0 if pd.isna(x["da033_3_"]) else x["da033_3_"], axis=1)
  1735. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 2小时
  1736. # health_status['Vigorous_Activity_2Hours_PerDay']=health_status["da034_1_"]
  1737. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 2小时
  1738. # health_status['Moderate_Effort_2Hours_PerDay']=health_status["da034_2_"]
  1739. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 2小时
  1740. # health_status['Walking_2Hours_PerDay']=health_status["da034_3_"]
  1741. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 30分钟
  1742. # health_status['Vigorous_Activity_30Min_PerDay']=health_status["da035_1_"]
  1743. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 30分钟
  1744. # health_status['Moderate_Effort_30Min_PerDay']=health_status["da035_2_"]
  1745. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 30分钟
  1746. # health_status['Walking_30Min_PerDay']=health_status["da035_3_"]
  1747. # # 在做[激烈活动]的这些天里,您一天花多少时间做[激烈活动] 4小时
  1748. # health_status['Vigorous_Activity_4Hours_PerDay']=health_status["da036_1_"]
  1749. # # 在做[中等强度的体力活动]的这些天里,您一天花多少时间做[中等强度的体力活动] 4小时
  1750. # health_status['Moderate_Effort_4Hours_PerDay']=health_status["da036_2_"]
  1751. # # 在做[走路]的这些天里,您一天花多少时间做[走路] 4小时
  1752. # health_status['Walking_4Hours_PerDay']=health_status["da036_3_"]
  1753. # 活动的原因
  1754. # 1 工作需要
  1755. # 2 娱乐
  1756. # 3 体育锻炼
  1757. # 4 其他
  1758. # health_status["Reason_For_Vigorous_Activity"]= health_status["da037_1_"]
  1759. # health_status["Reason_For_Moderate_Effort"]= health_status["da037_2_"]
  1760. # health_status["Reason_For_Walking"]= health_status["da037_3_"]
  1761. # 过去一个月是否进行了下列社交活动?
  1762. # (1) 串门、跟朋友交往
  1763. # (2) 打麻将、下棋、打牌、去社区活动室
  1764. # (3) 无偿向与您不住在一起的亲人、朋友或者邻居提供帮助
  1765. # (4) 去公园或者其他场所跳舞、健身、练气功等
  1766. # (5) 参加社团组织活动
  1767. # (6) 志愿者活动或者慈善活动/无偿照顾与您不住在一起的病人或残疾人
  1768. # (7) 上学或者参加培训课程
  1769. # (8)其他
  1770. # (9) 以上均没有
  1771. health_status["da038_s1"] = health_status.apply(lambda x: 1 if x["da038_s1"]==1 else 0, axis=1)
  1772. health_status["da038_s2"] = health_status.apply(lambda x: 1 if x["da038_s2"]==2 else 0, axis=1)
  1773. health_status["da038_s3"] = health_status.apply(lambda x: 1 if x["da038_s3"]==3 else 0, axis=1)
  1774. health_status["da038_s4"] = health_status.apply(lambda x: 1 if x["da038_s4"]==4 else 0, axis=1)
  1775. health_status["da038_s5"] = health_status.apply(lambda x: 1 if x["da038_s5"]==5 else 0, axis=1)
  1776. health_status["da038_s6"] = health_status.apply(lambda x: 1 if x["da038_s6"]==6 else 0, axis=1)
  1777. health_status["da038_s7"] = health_status.apply(lambda x: 1 if x["da038_s7"]==7 else 0, axis=1)
  1778. health_status["da038_s8"] = health_status.apply(lambda x: 1 if x["da038_s8"]==8 else 0, axis=1)
  1779. health_status["da038_s9"] = health_status.apply(lambda x: 1 if x["da038_s9"]==9 else 0, axis=1)
  1780. # 过去一个月的活动频率
  1781. # (1) Almost daily 差不多每天
  1782. # (2) Almost every week 差不多每周
  1783. # (3) Not regularly 不经常
  1784. # 过去一个月,您是否上网?
  1785. # 1 是
  1786. # 0 否
  1787. health_status["Internet_Usage_LastMonth"] = health_status["da040"].apply(lambda x : 1 if x ==1 else 0)
  1788. # 使用以下哪些工具上网?
  1789. # 1. Desktop computer 台式电脑
  1790. # 2. Laptop computer 笔记本电脑
  1791. # 3. Tablet computer 平板电脑(如 IPAD)
  1792. # 4. Cellphone 手机
  1793. # # 5. Other devices 其他设备
  1794. # health_status["Internet_Tools_Desktop_computer"] = health_status["da041_s1"]
  1795. # health_status["Internet_Tools_Laptop_computer"] = health_status["da041_s2"]
  1796. # health_status["Internet_Tools_Tablet_computer"] = health_status["da041_s3"]
  1797. # health_status["Internet_Tools_Cellphone"] = health_status["da041_s4"]
  1798. # health_status["Internet_Tools_Other"] = health_status["da041_s5"]
  1799. # # 上网一般做什么?
  1800. # health_status["Internet_Purpose_Chat"] = health_status["da042_s1"]
  1801. # health_status["Internet_Purpose_news"] = health_status["da042_s2"]
  1802. # health_status["Internet_Purpose_videos"] = health_status["da042_s3"]
  1803. # health_status["Internet_Purpose_games"] = health_status["da042_s4"]
  1804. # health_status["Internet_Purpose_Financial"] = health_status["da042_s5"]
  1805. # health_status["Internet_Purpose_Others"] = health_status["da042_s6"]
  1806. # # 是否会用手机支付
  1807. # # 1 是
  1808. # # 0 否
  1809. # health_status["Mobile_Payment"] = health_status["da043"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1810. # # 是否使用微信?
  1811. # health_status["Wechat_Usage"] = health_status["da044"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1812. # # 发不发微信朋友圈?
  1813. # health_status["Post_Moments"] = health_status["da045"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1814. # # 现在还在吸烟还是戒烟了?
  1815. # # 1 仍然抽烟
  1816. # # 2 戒烟
  1817. # health_status['Current_Smoking_Status']=health_status.apply(lambda x : 1 if x["da047"] == 1 else 2 if x["da047"] == 2 else np.nan, axis=1)
  1818. # # 吸烟时,一般抽什么烟?
  1819. # # (1) Smoking a pipe 用烟管吸烟(烟袋、旱烟)
  1820. # # (2) Smoking self-rolled cigarettes 自己卷烟抽
  1821. # # (3) Filtered cigarette带滤咀香烟
  1822. # # (4) Unfiltered cigarette不带滤咀香烟
  1823. # # (5) Cigar雪茄
  1824. # # (6) Water cigarettes 水烟
  1825. # health_status.loc[health_status['da048'] == 1, 'Smoking_Type_pipe'] = 1
  1826. # health_status.loc[health_status['da048'] == 2, 'Smoking_Type_rolled'] = 2
  1827. # health_status.loc[health_status['da048'] == 3, 'Smoking_Type_Filtered'] = 3
  1828. # health_status.loc[health_status['da048'] == 4, 'Smoking_Type_Unfiltered'] = 4
  1829. # health_status.loc[health_status['da048'] == 5, 'Smoking_Type_Cigar'] = 5
  1830. # health_status.loc[health_status['da048'] == 6, 'Smoking_Type_Water'] = 6
  1831. # # 现在/戒烟前平均一天抽多少支香烟?
  1832. # health_status['Daily_Cigarette_Count']=health_status.apply(lambda x : x["da050_1"] if not pd.isna(x["da050_1"]) else x["da050_2"] if not pd.isna(x["da050_2"]) else np.nan, axis=1)
  1833. # 在过去的一年, 喝酒吗
  1834. # (1) Drink more than once a month. 喝酒,每月超过一次
  1835. # (2) Drink but less than once a month 喝酒,但每月少于一次
  1836. # (3) None of these 什么都不喝
  1837. health_status['Drink_PastYear']=health_status["da051"]
  1838. # 过去一年内 平均一个月喝几次酒
  1839. # (1)Once a month 每月一次
  1840. # (2)2-3 times a month 每月2-3次
  1841. # (3)Once a week 每周一次
  1842. # (4)2-3 times a week 每周2-3次
  1843. # (5)4-6 times a week 每周4-6次
  1844. # (6)Once a day 每天一次
  1845. # (7)Twice a day 一天两次
  1846. # (8)More than twice a day 一天超过两次
  1847. # health_status['Drink_Monthly_Frequency']=health_status["da052"]
  1848. health_status['da003_12_'] = health_status.apply(process_row, axis=1)
  1849. columns_to_diseases_old = ['da003_1_', 'da003_2_','da003_3_','da003_4_','da003_5_','da003_6_','da003_7_','da003_8_','da003_9_','da003_10_','da003_11_'
  1850. ,'da003_12_','da003_14_','da003_15_']
  1851. columns_to_diseases_new = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1852. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1853. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
  1854. for (col_old, col_new) in zip(columns_to_diseases_old,columns_to_diseases_new):
  1855. health_status[col_new] = health_status.apply(lambda x : x[col_old] if not pd.isna(x[col_old]) else np.nan, axis=1)
  1856. diseases_2018 = data_2018[data_2018["wave"]=="2018"][['ID','Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1857. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1858. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']]
  1859. # 按 'ID' 列合并两个表
  1860. health_status = pd.merge(health_status, diseases_2018, on='ID', how='left', suffixes=("_2020","_2018"))
  1861. # 使用 fillna() 来更新字段
  1862. for col in columns_to_diseases_new:
  1863. health_status[col] = health_status[f'{col}_2020'].fillna(health_status[f'{col}_2018'])
  1864. health_status_select = health_status[['ID','householdID', 'communityID', 'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases',
  1865. 'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease',
  1866. 'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma', "Physical_activity", "Smoke", "Drink", "Accident_Or_Injury", "Fell_In_Last2Years", "Wear_Glasses",
  1867. "Average_Sleep_Hours", "Average_Nap_Minutes", "Vigorous_Activity_10Min", "Moderate_Effort_10Min"
  1868. , "Walking_10Min", "Vigorous_Activity_Days", "Moderate_Effort_Days", "Walking_Days"
  1869. , "da038_s1", "da038_s2", "da038_s3", "da038_s4", "da038_s5", "da038_s6", "da038_s7", "da038_s8", "da038_s9"
  1870. , "Internet_Usage_LastMonth", "Drink_PastYear"]]
  1871. data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
  1872. # 自上次访问以来的两年内,您是否发作过心脏病?
  1873. # 1 是
  1874. # 0 否
  1875. data_2020[['Heart_attack_2_years']]=np.nan
  1876. # 自上次访问以来,是否有医生诊断您中风复发?
  1877. # 1 是
  1878. # 0 否
  1879. data_2020[['Recurrent_Stroke']]=np.nan
  1880. #计算认知功能得分,分成三部分:电话问卷9分,词语回忆10分、画图1分
  1881. health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1882. health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1883. health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1884. health_status["dc002_score"] = health_status["dc004"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1885. health_status["dc003_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1886. health_status["dc019_score"] = health_status.apply(lambda x : 0 if x["dc007_1"]==997 else 1 if x["dc007_1"] ==1 and x["dc007_1_1"]==93 else 0 if x["dc007_1"] ==1 and (not x["dc007_1_1"]==93) else np.nan, axis=1)
  1887. health_status["dc020_score"] = health_status.apply(lambda x : 0 if x["dc007_2"]==997 else 1 if x["dc007_2"] ==1 and x["dc007_2_1"]==86 else 0 if x["dc007_2"] ==1 and (not x["dc007_2_1"]==86) else np.nan, axis=1)
  1888. health_status["dc021_score"] = health_status.apply(lambda x : 0 if x["dc007_3"]==997 else 1 if x["dc007_3"] ==1 and x["dc007_3_1"]==79 else 0 if x["dc007_3"] ==1 and (not x["dc007_3_1"]==79) else np.nan, axis=1)
  1889. health_status["dc022_score"] = health_status.apply(lambda x : 0 if x["dc007_4"]==997 else 1 if x["dc007_4"] ==1 and x["dc007_4_1"]==72 else 0 if x["dc007_4"] ==1 and (not x["dc007_4_1"]==72) else np.nan, axis=1)
  1890. health_status["dc023_score"] = health_status.apply(lambda x : 0 if x["dc007_5"]==997 else 1 if x["dc007_5"] ==1 and x["dc007_5_1"]==65 else 0 if x["dc007_5"] ==1 and (not x["dc007_5_1"]==65) else np.nan, axis=1)
  1891. #词语记忆
  1892. health_status["dc006s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s1"]==1 else 0, axis=1)
  1893. health_status["dc006s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s2"]==2 else 0, axis=1)
  1894. health_status["dc006s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s3"]==3 else 0, axis=1)
  1895. health_status["dc006s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s4"]==4 else 0, axis=1)
  1896. health_status["dc006s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s5"]==5 else 0, axis=1)
  1897. health_status["dc006s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s6"]==6 else 0, axis=1)
  1898. health_status["dc006s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s7"]==7 else 0, axis=1)
  1899. health_status["dc006s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s8"]==8 else 0, axis=1)
  1900. health_status["dc006s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s9"]==9 else 0, axis=1)
  1901. health_status["dc006s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc012_s10"]==10 else 0, axis=1)
  1902. health_status["dc027s1_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s1"]==1 else 0, axis=1)
  1903. health_status["dc027s2_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s2"]==2 else 0, axis=1)
  1904. health_status["dc027s3_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s3"]==3 else 0, axis=1)
  1905. health_status["dc027s4_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s4"]==4 else 0, axis=1)
  1906. health_status["dc027s5_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s5"]==5 else 0, axis=1)
  1907. health_status["dc027s6_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s6"]==6 else 0, axis=1)
  1908. health_status["dc027s7_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s7"]==7 else 0, axis=1)
  1909. health_status["dc027s8_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s8"]==8 else 0, axis=1)
  1910. health_status["dc027s9_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s9"]==9 else 0, axis=1)
  1911. health_status["dc027s10_score"] = health_status.apply(lambda x : np.nan if not x["xwordrecallbr"] ==1 else 1 if x["dc028_s10"]==10 else 0, axis=1)
  1912. #画图
  1913. health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
  1914. data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
  1915. health_status["dc001s3_score"] + health_status["dc002_score"]+ \
  1916. health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
  1917. health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
  1918. health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
  1919. health_status["dc006s5_score"] + health_status["dc006s6_score"] + health_status["dc006s7_score"] + \
  1920. health_status["dc006s8_score"] + health_status["dc006s9_score"] + health_status["dc006s10_score"] + \
  1921. health_status["dc027s1_score"]+ health_status["dc027s2_score"]+ \
  1922. health_status["dc027s3_score"]+ health_status["dc027s4_score"]+ health_status["dc027s5_score"]+ \
  1923. health_status["dc027s6_score"]+ health_status["dc027s7_score"]+ health_status["dc027s8_score"]+ \
  1924. health_status["dc027s9_score"]+health_status["dc027s10_score"]+\
  1925. health_status["draw_score"]
  1926. #心理得分
  1927. health_status["dc009_score"] = health_status["dc016"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1928. health_status["dc010_score"] = health_status["dc017"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1929. health_status["dc011_score"] = health_status["dc018"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1930. health_status["dc012_score"] = health_status["dc019"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1931. health_status["dc013_score"] = health_status["dc020"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1932. health_status["dc014_score"] = health_status["dc021"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1933. health_status["dc015_score"] = health_status["dc022"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1934. health_status["dc016_score"] = health_status["dc023"].apply(lambda x: 4-x if (not pd.isna(x)) and x <5 else np.nan)
  1935. health_status["dc017_score"] = health_status["dc024"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1936. health_status["dc018_score"] = health_status["dc025"].apply(lambda x: x-1 if (not pd.isna(x)) and x <5 else np.nan)
  1937. data_2020["psychiatric_score"] = health_status["dc009_score"] + health_status["dc010_score"] + health_status["dc011_score"] + \
  1938. health_status["dc012_score"] + health_status["dc013_score"] + health_status["dc014_score"] + health_status["dc015_score"] + \
  1939. health_status["dc016_score"] + health_status["dc017_score"] + health_status["dc018_score"]
  1940. #睡眠状态
  1941. # (1)Rarely or none of the time (<1 day) 很少或者根本没有(<1天)
  1942. # (2)Some or a little of the time (1-2 days) 不太多(1-2天)
  1943. # (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
  1944. # (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
  1945. data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x)
  1946. #ADL
  1947. health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1948. health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1949. health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1950. health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1951. health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1952. health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
  1953. data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
  1954. health_status["db014_score"] + health_status["db015_score"]
  1955. # 是否有管道煤气或天然气?
  1956. houseing["Gas_Connection"] = houseing["i018"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1957. # 是否带供暖设施(不包括土暖气和可制暖的空调)?
  1958. houseing["Heating_Facility"] = houseing["i019"].apply(lambda x : 1 if x ==1 else 0 if x == 2 else np.nan)
  1959. # 供暖所用的主要能源是什么?
  1960. # (1)Solar 太阳能
  1961. # (2)Coal 煤炭、蜂窝煤
  1962. # (3)Natural gas 管道天然气或煤气
  1963. # (4)Liquefied Petroleum Gas 液化石油气
  1964. # (5)Electric 电
  1965. # (6)Crop residue/Wood buring 秸秆、柴火
  1966. # (7)Other 其他
  1967. houseing["Heating_Energy"] = houseing["i020"].apply(lambda x : np.nan if x==8 else 0 if x==7 else x )
  1968. # 做饭用的主要燃料是什么?
  1969. # (1)Coal 煤炭、蜂窝煤
  1970. # (2)Natural gas 管道天然气或煤气
  1971. # (3)Marsh gas 沼气
  1972. # (4)Liquefied Petroleum Gas 液化石油气
  1973. # (5)Electric 电
  1974. # (6)crop residue/Wood burning 秸秆、柴火
  1975. # (7)other 其他
  1976. houseing["Cooking_Fuel"] = houseing["i021"].apply(lambda x : np.nan if x==9 else 0 if x == 8 or x == 7 else x)
  1977. houseing_select = houseing[['householdID', 'communityID','Gas_Connection',
  1978. 'Heating_Facility', 'Heating_Energy', 'Cooking_Fuel']]
  1979. data_2020 = pd.merge(data_2020, houseing_select, on = ['householdID', 'communityID'], how="left")
  1980. data_2020["wave"] = year
  1981. change_columns(data_2020)
  1982. data_2020 = pd.concat([data_2018, data_2020], axis=0)
  1983. #修改地区名称
  1984. #省份、城市名称和污染物数据格式对齐
  1985. #海东地区->海东市
  1986. data_2020['city'] = data_2020['city'].replace('海东地区', '海东市')
  1987. #北京 -> 北京市
  1988. data_2020['city'] = data_2020['city'].replace('北京', '北京市')
  1989. data_2020['province'] = data_2020['province'].replace('北京', '北京市')
  1990. #哈尔滨 -> 哈尔滨市
  1991. data_2020['city'] = data_2020['city'].replace('哈尔滨', '哈尔滨市')
  1992. #天津 -> 天津市
  1993. data_2020['city'] = data_2020['city'].replace('天津', '天津市')
  1994. data_2020['province'] = data_2020['province'].replace('天津', '天津市')
  1995. #广西省 -> 广西壮族自治区
  1996. data_2020['province'] = data_2020['province'].replace('广西省', '广西壮族自治区')
  1997. #巢湖市 -> 合肥市
  1998. data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
  1999. #襄樊市->襄阳市
  2000. data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
  2001. data_2020.to_csv("charls_paper_2.csv", index=False)
  2002. print(123)