# CHARLS_preprocess.r
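  # CHARLS modules (data files):
  #   biomarkers                     - physical examination measurements
  #   community                      - community information (2011 wave only)
  #   demographic_background         - respondent demographics
  #   family_information             - family member information
  #   family_transfer                - financial relationships within the family
  #   individual_income              - individual income and assets
  #   household_income               - household income, expenditure and assets
  #                                    (2011 and 2013 need to be computed; see
  #                                    exp_income_wealth)
  #   household_roster               - household members (2011); later split
  #                                    into Parent, Child and Other_HHmember
  #   housing_characteristics        - housing information
  #   Exit_Interview                 - exit interview (2013)
  #   Verbal_Autopsy                 - cause-of-death information (2013)
  #   Exit_Module                    - exit questionnaire (2020)
  #   exp_income_wealth              - household income, already aggregated
  #   interviewer_observation        - interviewer observations
  #   psu                            - community code to city mapping
  #   special (wave-specific) modules
  #   health_care_and_insurance      - health care and insurance
  #   health_status_and_functioning  - health status and functioning
  #   work_retirement_and_pension    - work, retirement and pension
  # Note: the questionnaires contain skip patterns that have to be taken into
  # account, mainly to tell a skipped question apart from a missing value.
  # install.packages("haven")
  # install.packages("readstata13", repos = "https://mirrors.sjtug.sjtu.edu.cn/cran/")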
  # Packages used by the script. read_dta() comes from haven; dplyr supplies the
  # pipe and data verbs. readstata13 is attached as in the original script even
  # though nothing visible here calls it.
  library(haven)
  library(readstata13)
  library(dplyr)

  # Root folder holding one "CHARLS<year>" sub-folder per wave, and the wave
  # that this run processes (kept as a string because it is pasted into paths).
  path <- "/root/r_base/CHARLS/"
  year <- "2020"
  # Load the raw Stata (.dta) files for the selected wave.
  #
  # File names and the set of modules differ between waves, so every wave has
  # its own branch. The 2015, 2018 and fallback branches have no PSU file of
  # their own and reuse the one shipped with the 2013 wave. The fallback branch
  # handles any other value of `year` (the script is configured for "2020").
  #
  # All paths are built from `path`. The original fallback branch read
  # Household_Income from "CHARLS/CHARLS<year>/...", i.e. relative to the
  # working directory, unlike every other read in this section.
  read_wave <- function(file_name, wave = year, ...) {
    # Extra arguments (e.g. encoding) are forwarded to read_dta().
    read_dta(paste0(path, "CHARLS", wave, "/", file_name), ...)
  }

  if (year == "2011") {
    demo <- read_wave("demographic_background.dta")
    psu <- read_wave("psu.dta", encoding = "GBK")
    biomarkers <- read_wave("biomarkers.dta")
    blood <- read_wave("Blood_20140429.dta")
    health_status <- read_wave("health_status_and_functioning.dta")
    health_care <- read_wave("health_care_and_insurance.dta")
    exp_income <- read_wave("exp_income_wealth.dta")
  } else if (year == "2013") {
    demo <- read_wave("Demographic_Background.dta")
    psu <- read_wave("PSU.dta", encoding = "GBK")
    biomarkers <- read_wave("Biomarker.dta")
    health_status <- read_wave("Health_Status_and_Functioning.dta")
    health_care <- read_wave("Health_Care_and_Insurance.dta")
    exp_income <- read_wave("exp_income_wealth.dta")
  } else if (year == "2015") {
    demo <- read_wave("Demographic_Background.dta")
    psu <- read_wave("PSU.dta", wave = "2013", encoding = "GBK")
    biomarkers <- read_wave("Biomarker.dta")
    blood <- read_wave("Blood.dta")
    health_status <- read_wave("Health_Status_and_Functioning.dta")
    health_care <- read_wave("Health_Care_and_Insurance.dta")
    Household_Income <- read_wave("Household_Income.dta")
    Individual_Income <- read_wave("Individual_Income.dta")
  } else if (year == "2018") {
    demo <- read_wave("Demographic_Background.dta")
    psu <- read_wave("PSU.dta", wave = "2013", encoding = "GBK")
    health_status <- read_wave("Health_Status_and_Functioning.dta")
    health_care <- read_wave("Health_Care_and_Insurance.dta")
    Household_Income <- read_wave("Household_Income.dta")
    Individual_Income <- read_wave("Individual_Income.dta")
    Cognition <- read_wave("Cognition.dta")
  } else {
    demo <- read_wave("Demographic_Background.dta")
    psu <- read_wave("PSU.dta", wave = "2013", encoding = "GBK")
    health_status <- read_wave("Health_Status_and_Functioning.dta")
    Household_Income <- read_wave("Household_Income.dta")
    Individual_Income <- read_wave("Individual_Income.dta")
  }
  # Demographics: gender, birth year, marital status and place of residence.
  #
  # The gender / birth-year / marital-status columns have different names in
  # each wave; pick the wave's columns and give them common names. An
  # unsupported wave now stops here with a clear message. Previously `data` was
  # never assigned in that case and the following lines failed on the base
  # function utils::data instead.
  demo_columns <- switch(year,
    "2011" = c("ID", "householdID", "communityID", "rgender", "ba002_1", "be001"),
    "2013" = c("ID", "householdID", "communityID", "ba000_w2_3", "ba002_1", "be001"),
    "2015" = ,
    "2018" = c("ID", "householdID", "communityID", "ba000_w2_3", "ba004_w3_1", "be001"),
    "2020" = c("ID", "householdID", "communityID", "ba001", "ba003_1", "ba011"),
    stop("Unsupported CHARLS wave: ", year, call. = FALSE)
  )
  data <- demo[, demo_columns]

  # Columns 4-6 are gender, birth year and marital status, in that order.
  colnames(data)[4:6] <- c("gender", "born_year", "be001")

  # Age in the survey year; stays NA when the birth year is missing.
  data$age <- ifelse(is.na(data$born_year), NA, as.numeric(year) - data$born_year)
  data$wave <- year

  # Place of residence: attach province and city through the community code.
  data <- merge(data, psu[, c("communityID", "province", "city")],
                by = "communityID", all.x = TRUE)
  # Align province and city names with the format used by the pollutant data.
  # Each old name is replaced by the new name at the same position.
  city_old <- c("海东地区", "北京", "哈尔滨", "天津", "巢湖市", "襄樊市")
  city_new <- c("海东市", "北京市", "哈尔滨市", "天津市", "合肥市", "襄阳市")
  for (k in seq_along(city_old)) {
    data$city[data$city == city_old[k]] <- city_new[k]
  }

  province_old <- c("北京", "天津", "广西省")
  province_new <- c("北京市", "天津市", "广西壮族自治区")
  for (k in seq_along(province_old)) {
    data$province[data$province == province_old[k]] <- province_new[k]
  }
  # Physical measurements (per the original note: height, weight, systolic
  # pressure, diastolic pressure, pulse). The 2011, 2013 and 2015 waves use the
  # same column names; for the other waves the columns are added as NA so every
  # wave ends up with the same layout.
  if (year %in% c("2011", "2013", "2015")) {
    biomarkers_select <- biomarkers[, c("ID", "householdID", "communityID",
                                        "qi002", "ql002", "qa011", "qa012", "qa013")]
    data <- merge(data, biomarkers_select,
                  by = c("ID", "householdID", "communityID"), all.x = TRUE)
  } else {
    new_columns <- c("qi002", "ql002", "qa011", "qa012", "qa013")
    data[new_columns] <- NA
  }
  # Blood test results (per the original note: white blood cells, mean
  # corpuscular volume, platelets, blood urea nitrogen, glucose, creatinine,
  # total cholesterol, triglycerides, HDL, LDL, C-reactive protein, glycated
  # haemoglobin, uric acid, haematocrit, haemoglobin, cystatin C).
  blood_names <- c("ID", "bl_wbc", "bl_mcv", "bl_plt", "bl_bun", "bl_glu",
                   "bl_crea", "bl_cho", "bl_tg", "bl_hdl", "bl_ldl", "bl_crp",
                   "bl_hbalc", "bl_ua", "bl_hct", "bl_hgb", "bl_cysc")
  if (year %in% c("2011", "2015")) {
    if (year == "2011") {
      # NOTE(review): the 2011 columns are renamed purely by position after
      # dropping two columns; this assumes the remaining 17 columns are already
      # in the order of `blood_names` -- confirm against the 2011 blood file.
      blood <- subset(blood, select = -c(bloodweight, qc1_va003))
    } else {
      blood <- blood[, blood_names]
    }
    colnames(blood) <- blood_names
    data <- merge(data, blood, by = c("ID"), all.x = TRUE)
  } else {
    # Waves without a blood file get the same columns, filled with NA.
    new_columns <- blood_names[-1]
    data[new_columns] <- NA
  }
  # Health status and functioning: self-reported general health.
  # 2018 reads da002 and 2020 reads da001; every other wave uses da001 and falls
  # back to da002 where da001 is missing.
  health_status$general_helth_status <- switch(year,
    "2018" = health_status$da002,
    "2020" = health_status$da001,
    ifelse(is.na(health_status$da001), health_status$da002, health_status$da001)
  )
  # Harmonise the names of the cognition items across waves so the scoring code
  # further down can address them by a single set of names.
  #
  # Columns are renamed through explicit old -> new pairs. The previous form,
  #   names(x)[names(x) %in% old] <- new
  # hands out the new names in the order the columns occur in the data frame,
  # not in the order of `old`. Any pairing that is not listed in file order
  # (e.g. dc006_w4 -> dc001s2, or dc024_w4 -> dc025 listed after the dc047
  # items) could therefore end up on the wrong column.
  rename_by_map <- function(df, old_names, new_names) {
    # Rename df's columns old_names[i] -> new_names[i]; columns that are not
    # present are reported and skipped instead of shifting the other names.
    stopifnot(length(old_names) == length(new_names))
    pos <- match(old_names, names(df))
    absent <- is.na(pos)
    if (any(absent)) {
      warning("Columns not found, left unrenamed: ",
              paste(old_names[absent], collapse = ", "), call. = FALSE)
    }
    names(df)[pos[!absent]] <- new_names[!absent]
    df
  }

  word_slots <- 1:11
  if (year == "2013") {
    health_status <- rename_by_map(
      health_status,
      paste0("dc006_1_s", word_slots),
      paste0("dc006s", word_slots)
    )
  } else if (year == "2018") {
    Cognition <- rename_by_map(
      Cognition,
      c("dc001_w4", "dc006_w4", "dc003_w4", "dc005_w4", "dc002_w4"),
      c("dc001s1", "dc001s2", "dc001s3", "dc002", "dc003")
    )
    Cognition <- rename_by_map(
      Cognition,
      paste0("dc014_w4_", 1:5, "_1"),
      paste0("dc0", 19:23)
    )
    Cognition <- rename_by_map(
      Cognition,
      paste0("dc028_w4_s", word_slots),
      paste0("dc006s", word_slots)
    )
    Cognition <- rename_by_map(
      Cognition,
      c(paste0("dc047_w4_s", word_slots), "dc024_w4"),
      c(paste0("dc027s", word_slots), "dc025")
    )
  } else if (year == "2020") {
    # Word recall, first round.
    health_status <- rename_by_map(
      health_status,
      paste0("dc012_s", word_slots),
      paste0("dc006s", word_slots)
    )
    # Word recall, second round.
    health_status <- rename_by_map(
      health_status,
      paste0("dc028_s", word_slots),
      paste0("dc027s", word_slots)
    )
  }
  # Activities of daily living (ADL): toileting, eating, dressing, continence,
  # getting in and out of bed, and bathing (6 items). An item is flagged 1 when
  # its answer code is above 2, else 0; ADL_score is the number of flagged
  # items, and a score above 0 is treated as ADL disability.
  # The 2020 wave stores the six items under different column names.
  adl_items <- if (year == "2020") {
    c("db001", "db003", "db005", "db007", "db009", "db011")
  } else {
    c("db010", "db011", "db012", "db013", "db014", "db015")
  }
  adl_flags <- c("db010_score", "db011_score", "db012_score",
                 "db013_score", "db014_score", "db015_score")
  for (k in seq_along(adl_items)) {
    health_status[[adl_flags[k]]] <- ifelse(health_status[[adl_items[k]]] > 2, 1, 0)
  }
  # Row total; a missing item makes the whole score NA.
  health_status$ADL_score <- rowSums(health_status[, adl_flags])
  # Instrumental ADL (IADL): housework, cooking, shopping, taking medication
  # and managing money (5 items). An item is flagged 1 when its answer code is
  # above 2, else 0; needing help with any item counts as IADL disability.
  # The 2020 wave stores the five items under different column names.
  iadl_items <- if (year == "2020") {
    c("db012", "db014", "db016", "db020", "db022")
  } else {
    c("db016", "db017", "db018", "db019", "db020")
  }
  iadl_flags <- c("db016_score", "db017_score", "db018_score",
                  "db019_score", "db020_score")
  for (k in seq_along(iadl_items)) {
    health_status[[iadl_flags[k]]] <- ifelse(health_status[[iadl_items[k]]] > 2, 1, 0)
  }
  # Row total; a missing item makes the whole score NA.
  health_status$IADL_score <- rowSums(health_status[, iadl_flags])
  # Chronic conditions, physical activity, smoking and drinking: pick the
  # wave's columns, give them common output names and merge them into `data`.
  lead_cols <- c("ID", "householdID", "communityID", "general_helth_status",
                 "ADL_score", "IADL_score")
  activity_slots <- rep(1:3, times = 5)

  if (year == "2020") {
    # The 2020 disease items are coded differently from the other waves:
    # 99 becomes 2, 1-3 become 1, everything else becomes NA.
    columns_to_process <- paste0("da002_", 1:15, "_")
    recode_diagnosis <- function(v) {
      out <- rep(NA_real_, length(v))
      out[v %in% 1:3] <- 1
      out[which(v == 99)] <- 2
      out
    }
    health_status[columns_to_process] <- lapply(health_status[columns_to_process],
                                                recode_diagnosis)

    # 2020 asks about Parkinson's and memory-related disease separately; fold
    # item 13 into item 12 to match earlier waves. Both are now 1, 2 or NA, so
    # the smaller non-missing value gives: 1 if either is 1, otherwise 2 if
    # either is 2, otherwise NA.
    health_status$da002_12_ <- pmin(health_status$da002_12_,
                                    health_status$da002_13_, na.rm = TRUE)

    source_cols <- c(
      paste0("da002_", c(1:12, 14, 15), "_"),
      sprintf("da0%d_%d_", rep(32:36, each = 3), activity_slots),
      "da046", "da047", "da050_1", "da051"
    )
    health_status_select <- health_status[, c(lead_cols, source_cols)]

    # Swap codes 1 and 3 of da051; other codes are left unchanged.
    drink_code <- health_status_select$da051
    health_status_select$da051 <- ifelse(drink_code == 1, 3,
                                         ifelse(drink_code == 3, 1, drink_code))
  } else {
    source_cols <- c(
      paste0("da007_", 1:14, "_"),
      sprintf("da0%d_%d_", rep(51:55, each = 3), activity_slots),
      "da059", "da061", "da063", "da069"
    )
    health_status_select <- health_status[, c(lead_cols, source_cols)]
  }

  # NOTE(review): three names below start with a space (" Kidney_Diease",
  # " Memory_Related_Disease", " Arthritis_or_Rheumatism") and several are
  # misspelled. They are kept exactly as they were because they define the
  # column names of the exported CSV.
  disease_names <- c(
    "Hypertension", "Dyslipidemia", "Disabetes_or_High_Blood_Sugar",
    "Cancer_or_Malignant_Tumor", "Chronic_Lung_Diseases", "Liver_Disease",
    "Heart_Problems", "Stroke", " Kidney_Diease",
    "Stomach_or_Other_Digestive_Disease",
    "Emotional_Nervous_or_Psychiatric_Problems", " Memory_Related_Disease",
    " Arthritis_or_Rheumatism", "Asthma"
  )
  # Three activity types, repeated for each of the five question suffixes.
  activity_names <- paste0(
    rep(c("Vigorous_Activities", "Moderate_Physical_Effort", "Walking"), times = 5),
    rep(c("", "_day", "_2h", "_30m", "_4h"), each = 3)
  )
  colnames(health_status_select) <- c(
    lead_cols, disease_names, activity_names,
    "Smoke", "Smoke_still", "Number_Cigarettes", "Drink"
  )
  data <- merge(data, health_status_select,
                by = c("ID", "householdID", "communityID"), all.x = TRUE)
  # In 2018 the cognition items come from the separate Cognition file, so it
  # replaces health_status for all of the scoring below.
  if (year == "2018") {
    health_status <- Cognition
  }

  # Cognitive score, in three parts per the original note: telephone
  # questionnaire (10 points), word recall (20 points) and drawing (1 point).
  # This section computes the first part: an item scores 1 when its answer
  # equals the expected code, and 0 otherwise (including missing answers).
  # NOTE(review): the expected codes 2 and 3 for the second and third items
  # are applied to every wave; confirm they still mean "correct" for the
  # 2018 and 2020 source columns.
  orientation_items <- if (year == "2020") {
    c("dc001", "dc005", "dc003", "dc004", "dc002",
      "dc007_1", "dc007_2", "dc007_3", "dc007_4", "dc007_5")
  } else {
    c("dc001s1", "dc001s2", "dc001s3", "dc002", "dc003",
      "dc019", "dc020", "dc021", "dc022", "dc023")
  }
  orientation_expected <- c(1, 2, 3, 1, 1, 93, 86, 79, 72, 65)
  orientation_flags <- c("dc001s1_score", "dc001s2_score", "dc001s3_score",
                         "dc002_score", "dc003_score", "dc019_score",
                         "dc020_score", "dc021_score", "dc022_score",
                         "dc023_score")
  for (k in seq_along(orientation_items)) {
    answer <- health_status[[orientation_items[k]]]
    health_status[[orientation_flags[k]]] <-
      as.numeric(!is.na(answer) & answer == orientation_expected[k])
  }
  health_status$Cognitive_functioning <- rowSums(health_status[, orientation_flags])
  # Word recall: slots s1..s11 of both recall rounds (dc006s*, dc027s*).
  # Slot k scores 1 when its value equals k, and 0 otherwise (including
  # missing). The final score is the total over both rounds divided by 2.
  # NOTE(review): 22 slots are summed although the scoring note above states a
  # 20-point maximum for word recall; confirm whether slot 11 is a real word
  # or a "none" option.
  recall_items <- c(paste0("dc006s", 1:11), paste0("dc027s", 1:11))
  recall_expected <- rep(1:11, times = 2)
  recall_flags <- paste0(recall_items, "_score")
  for (k in seq_along(recall_items)) {
    chosen <- health_status[[recall_items[k]]]
    health_status[[recall_flags[k]]] <-
      as.numeric(!is.na(chosen) & chosen == recall_expected[k])
  }
  health_status$remenber_functioning <- rowSums(health_status[, recall_flags]) / 2
  # Drawing task: 1 point when the item equals 1, otherwise 0 (including
  # missing). The item is dc009 in the 2020 wave and dc025 in every other wave.
  draw_item <- if (year == "2020") "dc009" else "dc025"
  drawing <- health_status[[draw_item]]
  health_status$draw_score <- as.numeric(!is.na(drawing) & drawing == 1)
  # Mood score: ten items, each shifted to start at 0 (answer - 1), except the
  # 5th and 8th items, which are reversed (4 - answer). The score is the sum of
  # the ten items, so one missing answer makes it NA.
  # The 2020 wave stores the items as dc016..dc025, other waves as dc009..dc018.
  mood_items <- if (year == "2020") {
    paste0("dc0", 16:25)
  } else {
    sprintf("dc%03d", 9:18)
  }
  mood_flags <- sprintf("dc%03d_score", 9:18)
  reversed_positions <- c(5, 8)
  for (k in seq_along(mood_items)) {
    response <- health_status[[mood_items[k]]]
    health_status[[mood_flags[k]]] <- if (k %in% reversed_positions) {
      4 - response
    } else {
      response - 1
    }
  }
  health_status$psychiatric_score <- rowSums(health_status[, mood_flags])
  # Keep only the identifiers and the four derived scores, then attach them to
  # the respondent table (respondents without scores are kept, with NA).
  respondent_keys <- c("ID", "householdID", "communityID")
  health_status <- health_status[, c(respondent_keys, "Cognitive_functioning",
                                     "remenber_functioning", "draw_score",
                                     "psychiatric_score")]
  data <- merge(data, health_status, by = respondent_keys, all.x = TRUE)
  # Hospitalisation: ee003 and ee004 are exported as received_inpatient_care
  # and Frequency_one_year. No health-care file is loaded for 2020, so that
  # wave gets both columns filled with NA.
  if (year == "2020") {
    data["received_inpatient_care"] <- NA
    data["Frequency_one_year"] <- NA
  } else {
    health_care <- health_care[, c("ID", "householdID", "communityID", "ee003", "ee004")]
    colnames(health_care)[4:5] <- c("received_inpatient_care", "Frequency_one_year")
    data <- merge(data, health_care,
                  by = c("ID", "householdID", "communityID"), all.x = TRUE)
  }
  # Individual income. 2011 and 2013 take INDV_INCOME from exp_income_wealth.
  # Later waves derive it from Individual_Income: 0 when ga001 equals 2,
  # otherwise the value of ga002.
  income_keys <- c("ID", "householdID", "communityID")
  if (year %in% c("2011", "2013")) {
    exp_income <- exp_income[, c(income_keys, "INDV_INCOME")]
    data <- merge(data, exp_income, by = income_keys, all.x = TRUE)
  } else {
    Individual_Income$INDV_INCOME <- ifelse(Individual_Income$ga001 == 2, 0,
                                            Individual_Income$ga002)
    data <- merge(data, Individual_Income[, c(income_keys, "INDV_INCOME")],
                  by = income_keys, all.x = TRUE)
    # Household_Income$INCOME_TOTAL <- apply(Household_Income[,c('ga006_1_1_','ga006_1_2_','ga006_1_3_'
    # ,'ga006_1_4_','ga006_1_5_','ga006_1_6_','ga006_1_7_','ga006_1_8_','ga006_1_9_','ga006_1_10_')], 1, function(x) sum(x, na.rm = TRUE))
  }

  # Export this wave's table as result<year>.csv under `path`.
  wave_csv <- paste0(path, "result", year, ".csv")
  write.csv(data, file = wave_csv, row.names = FALSE)
  # Combine the per-wave result files into a single result_all.csv.
  #
  # Fixes relative to the previous version:
  # - Only result<4-digit year>.csv files are read. The old "\\.csv$" pattern
  #   also picked up result_all.csv from an earlier run and any other CSV
  #   under `path`.
  # - The accumulator used to start as NA, so the "first file" test
  #   (length(df_combined) == 0) was never true and the first rbind() added an
  #   all-NA row to the output.
  # - Nothing is written when no result file is found; the old code wrote a
  #   file containing a single NA.
  csv_files <- list.files(path = path, pattern = "^result[0-9]{4}\\.csv$",
                          recursive = TRUE, full.names = TRUE)
  if (length(csv_files) > 0) {
    wave_tables <- lapply(csv_files, function(file) {
      wave_table <- read.csv(file, stringsAsFactors = FALSE)
      print(ncol(wave_table))
      print(paste("Read file:", file))
      wave_table
    })
    # rev() keeps the previous row order, where each newly read file was
    # placed on top of the ones read before it.
    df_combined <- do.call(rbind, rev(wave_tables))
    write.csv(df_combined, file = paste0(path, "result_all", ".csv"), row.names = FALSE)
  } else {
    df_combined <- NULL
    warning("No result<year>.csv files found under ", path,
            "; result_all.csv was not written.", call. = FALSE)
  }