data_preprocess.r 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. # NHANES
  2. # install.packages("caret", repos="https://mirrors.ustc.edu.cn/CRAN/")
  3. library(caret)
  4. library(nhanesA)
  5. library(knitr)
  6. library(tidyverse)
  7. library(plyr)
  8. library(dplyr)
  9. library(foreign)
  10. path = "NHANES/2017-2018/"
  11. year = "_J"
  12. # year_ = "2001-2002"
  13. DEMO <- read.xport(paste0(path , "DEMO",year,".XPT"))
  14. #选取数据发布号、样本人员的访谈和检查状态、性别、年龄、种族、出生地
  15. #社会经济地位数据:家庭 PIR、家庭年度收入
  16. if(year =="" || year == "_B" || year == "_C" || year == "_D"){
  17. DEMO <- DEMO %>% select(SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDRETH1, DMDBORN, INDFMPIR, INDFMINC)
  18. }else if(year =="_E" || year == "_F"){
  19. DEMO <- DEMO %>% select(SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDRETH1, DMDBORN2, INDFMPIR, INDHHIN2)
  20. }else{
  21. DEMO <- DEMO %>% select(SEQN, SDDSRVYR, RIDSTATR, RIAGENDR, RIDAGEYR, RIDRETH1, DMDBORN4, INDFMPIR, INDHHIN2)
  22. }
  23. print(names(DEMO)[9])
  24. names(DEMO)[7] <- "DMDBORN"
  25. names(DEMO)[9] <- "INDFMINC"
  26. # #收入
  27. # if(year=="_E" | year=="_F" | year=="_H" | year=="_G" |year=="_I" | year=="_J"){
  28. # INQ <- read.xport(paste0(path, "INQ",year,".XPT"))
  29. # data <- join_all(list(data, INQ), by = "SEQN", type = "left")
  30. # }
  31. #重量、站立高度、体重指数
  32. BMX <- read.xport(paste0(path, "BMX",year,".XPT"))
  33. BMX <- BMX %>% select(SEQN, BMXWT , BMXHT , BMXBMI )
  34. data <- join_all(list(DEMO, BMX), by = "SEQN", type = "left")
  35. #收缩压平均值、舒张压平均值、60 秒心率
  36. BPX <- read.xport(paste0(path, "BPX",year,".XPT"))
  37. BPX <- BPX %>% select(SEQN, BPXSY1, BPXDI1, BPXCHR)
  38. data <- join_all(list(data, BPX), by = "SEQN", type = "left")
  39. #空腹血浆葡萄糖
  40. if(year ==""){
  41. LAB10AM <- read.xport(paste0(path, "LAB10AM",year,".XPT"))
  42. LAB10AM <- LAB10AM %>% select(SEQN, LBXGLUSI )
  43. }else if (year == "_B"){
  44. LAB10AM <- read.xport(paste0(path, "L10AM",year,".XPT"))
  45. LAB10AM <- LAB10AM %>% select(SEQN, LBXGLUSI )
  46. }else if (year == "_C"){
  47. LAB10AM <- read.xport(paste0(path, "L10AM",year,".XPT"))
  48. LAB10AM <- LAB10AM %>% select(SEQN, LBDGLUSI)
  49. }else{
  50. LAB10AM <- read.xport(paste0(path, "GLU",year,".XPT"))
  51. LAB10AM <- LAB10AM %>% select(SEQN, LBDGLUSI)
  52. }
  53. names(LAB10AM)[2] <- "LBDGLUSI"
  54. data <- join_all(list(data, LAB10AM), by = "SEQN", type = "left")
  55. #尿白蛋白
  56. if (year == ""){
  57. LAB16 <- read.xport(paste0(path, "LAB16",year,".XPT"))
  58. }else if(year == "_B" | year == "_C"){
  59. LAB16 <- read.xport(paste0(path, "L16",year,".XPT"))
  60. }else{
  61. LAB16 <- read.xport(paste0(path, "ALB_CR",year,".XPT"))
  62. }
  63. LAB16 <- LAB16 %>% select(SEQN, URXUMA)
  64. data <- join_all(list(data, LAB16), by = "SEQN", type = "left")
  65. #血常规:白细胞计数 (SI)1000 个细胞/uL、淋巴细胞计数、单核细胞数、红细胞计数 SI(百万细胞/uL)、血红蛋白 (g/dL)、血小板计数 (1000 个细胞/uL)
  66. #、嗜酸性粒细胞数、嗜碱性粒细胞数、Segmented neutrophils number(分叶中性粒细胞数量)1000 细胞/uL
  67. if (year == ""){
  68. LAB25 <- read.xport(paste0(path, "LAB25",year,".XPT"))
  69. }else if(year == "_B" | year == "_C"){
  70. LAB25 <- read.xport(paste0(path, "L25",year,".XPT"))
  71. }else{
  72. LAB25 <- read.xport(paste0(path, "CBC",year,".XPT"))
  73. }
  74. LAB25 <- LAB25 %>% select(SEQN, LBXWBCSI, LBDLYMNO, LBDMONO, LBXRBCSI, LBXHGB, LBXPLTSI, LBDEONO, LBDBANO, LBDNENO)
  75. #计算炎症指标
  76. LAB25$SIRI <- LAB25$LBDNENO*LAB25$LBDMONO/LAB25$LBDLYMNO
  77. LAB25$SII <- LAB25$LBXPLTSI*LAB25$LBDNENO/LAB25$LBDLYMNO
  78. data <- join_all(list(data, LAB25), by = "SEQN", type = "left")
  79. #血脂:甘油三酯(mmol/L))、低密度脂蛋白胆固醇(mmol/L))、总胆固醇、高密度脂蛋白胆固醇(mmol/L)、总胆固醇、甘油三酯
  80. #肾功能:血尿素氮、肌酐
  81. #肝功能:ALT、AST、总胆红素
  82. if (year == ""){
  83. LAB13AM <- read.xport(paste0(path, "LAB13AM",year,".XPT"))
  84. }else if(year == "_B" | year == "_C"){
  85. LAB13AM <- read.xport(paste0(path, "L13AM",year,".XPT"))
  86. }else{
  87. LAB13AM <- read.xport(paste0(path, "TRIGLY",year,".XPT"))
  88. }
  89. LAB13AM <- LAB13AM %>% select(SEQN, LBDTRSI, LBDLDLSI)
  90. data <- join_all(list(data, LAB13AM), by = "SEQN", type = "left")
  91. if(year ==""){
  92. LAB13 <- read.xport(paste0(path, "LAB13",year,".XPT"))
  93. }else if(year == "_B" | year == "_C"){
  94. LAB13 <- read.xport(paste0(path, "L13",year,".XPT"))
  95. }else{
  96. LAB13 <- read.xport(paste0(path, "TCHOL",year,".XPT"))
  97. }
  98. if(year=="" | year=="_B"){
  99. LAB13 <- LAB13 %>% select(SEQN, LBDTCSI, LBDHDLSI)
  100. }else if(year == "_C"){ #Direct HDL-Cholesterol
  101. LAB13 <- LAB13 %>% select(SEQN, LBDTCSI, LBDHDDSI)
  102. }else{
  103. LAB13 <- LAB13 %>% select(SEQN, LBDTCSI)
  104. HDL <- read.xport(paste0(path, "HDL",year,".XPT"))
  105. HDL <- HDL %>% select(SEQN, LBDHDDSI)
  106. LAB13 <- join_all(list(LAB13, HDL), by = "SEQN", type = "left")
  107. }
  108. names(LAB13)[3] <- "LBDHDDSI"
  109. data <- join_all(list(data, LAB13), by = "SEQN", type = "left")
  110. #计算炎症指标
  111. data$MHR <- data$LBDMONO/data$LBDHDDSI
  112. data$NHR <- data$LBDNENO/data$LBDHDDSI
  113. data$PHR <- data$LBXPLTSI/data$LBDHDDSI
  114. data$LHR <- data$LBDLYMNO /data$LBDHDDSI
  115. # 同型半胱氨酸(1999-2006),其他年份为空
  116. if(year == "" || year == "_D" || year == "_C" || year == "_B"){
  117. if(year == ""){
  118. LAB06 <- read.xport(paste0(path, "LAB06",year,".XPT"))
  119. LAB06 <- LAB06 %>% select(SEQN, LBXHCY)
  120. }else if(year == "_B"){
  121. LAB06 <- read.xport(paste0(path, "L06",year,".XPT"))
  122. LAB06 <- LAB06 %>% select(SEQN, LBDHCY)
  123. }else if(year == "_C"){
  124. LAB06 <- read.xport(paste0(path, "L06MH",year,".XPT"))
  125. LAB06 <- LAB06 %>% select(SEQN, LBXHCY)
  126. }else{
  127. LAB06 <- read.xport(paste0(path, "HCY",year,".XPT"))
  128. LAB06 <- LAB06 %>% select(SEQN, LBXHCY)
  129. }
  130. names(LAB06)[2] <- "LBXHCY"
  131. data <- join_all(list(data, LAB06), by = "SEQN", type = "left")
  132. }else{
  133. data$LBXHCY <- NA
  134. }
  135. if(year == ""){
  136. LAB18 <- read.xport(paste0(path, "LAB18",year,".XPT"))
  137. }else if(year == "_B" || year == "_C"){
  138. LAB18 <- read.xport(paste0(path, "L40",year,".XPT"))
  139. }else{
  140. LAB18 <- read.xport(paste0(path, "BIOPRO",year,".XPT"))
  141. }
  142. LAB18 <- LAB18 %>% select(SEQN, LBDSCHSI, LBDSTRSI, LBDSBUSI, LBDSCRSI, LBXSATSI, LBXSASSI, LBDSTBSI )
  143. data <- join_all(list(data, LAB18), by = "SEQN", type = "left")
  144. #甲状腺功能:促甲状腺激素
  145. if(year == "" || year == "_B" ||year =="_E" || year =="_F" || year =="_G"){
  146. if(year == ""){
  147. LAB18T4 <- read.xport(paste0(path, "LAB18T4",year,".XPT"))
  148. LAB18T4 <- LAB18T4 %>% select(SEQN, LBXTSH)
  149. }else if(year == "_B"){
  150. LAB18T4 <- read.xport(paste0(path, "L40T4",year,".XPT"))
  151. LAB18T4 <- LAB18T4 %>% select(SEQN, LBXTSH)
  152. }else if (year =="_E" || year =="_F" || year =="_G"){
  153. LAB18T4 <- read.xport(paste0(path, "THYROD",year,".XPT"))
  154. LAB18T4 <- LAB18T4 %>% select(SEQN, LBXTSH1)
  155. }
  156. names(LAB18T4)[2] <- "LBXTSH"
  157. data <- join_all(list(data, LAB18T4), by = "SEQN", type = "left")
  158. }else{
  159. data$LBXTSH <- NA
  160. }
  161. #甲状腺功能:游离甲状腺素(2007-2012)
  162. if(year == "_E" || year == "_F" || year == "_G"){
  163. THYROD <- read.xport(paste0(path, "THYROD",year,".XPT"))
  164. THYROD <- THYROD %>% select(SEQN, LBDT4FSI)
  165. data <- join_all(list(data, THYROD), by = "SEQN", type = "left")
  166. }else{
  167. data$LBDT4FSI <- NA
  168. }
  169. #吸烟史
  170. #成人 20+
  171. SMQ <- read.xport(paste0(path, "SMQ",year,".XPT"))
  172. SMQ <- SMQ %>% select(SEQN, SMQ020, SMD030, SMQ040, SMQ050Q, SMQ050U, SMD057)
  173. data <- join_all(list(data, SMQ), by = "SEQN", type = "left")
  174. #饮酒史 20+
  175. ALQ <- read.xport(paste0(path, "ALQ",year,".XPT"))
  176. if(year == "" || year == "_B" ||year =="_E" || year =="_F" || year =="_C" || year =="_D"){
  177. ALQ <- ALQ %>% select(SEQN, ALQ150, ALQ130)
  178. }else{
  179. ALQ <- ALQ %>% select(SEQN, ALQ151, ALQ130)
  180. }
  181. names(ALQ)[2] <- "ALQ150"
  182. data <- join_all(list(data, ALQ), by = "SEQN", type = "left")
  183. #饮食习惯
  184. DBQ <- read.xport(paste0(path, "DBQ",year,".XPT"))
  185. if(year == "" || year == "_B" ||year =="_E" || year =="_C" || year =="_D"){
  186. if(year == ""){
  187. DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBQ070A, DBQ070B, DBQ070C, DBQ390, DBD400, DBD410, DBQ420,
  188. DBQ070D, DBD195, DBQ220A, DBQ220B, DBQ220C, DBQ220D, DBD235A, DBD235B, DBD235C, DBQ300, DBQ330, DBD360, DBD370, DBD380)
  189. }else if (year == "_B"){
  190. DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBD071A, DBD071B, DBD071C, DBQ390, DBQ400, DBD411, DBD421,
  191. DBD071D, DBD196, DBD221A, DBD221B, DBD221C, DBD221D, DBD235AE, DBD235BE, DBD235CE, DBD301, DBQ330, DBQ360, DBQ370, DBD381)
  192. }else if(year == "_C"){
  193. DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBQ071A, DBQ071B, DBQ071C, DBQ390, DBQ400, DBD411, DBQ421,
  194. DBQ071D, DBD197, DBQ221A, DBQ221B, DBQ221C, DBQ221D, DBQ235A, DBQ235B, DBQ235C, DBQ301, DBQ330, DBQ360, DBQ370, DBD381)
  195. }else if(year == "_D" || year == "_E"){
  196. DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD040, DBD050, DBD060, DBD072A, DBD072B, DBD072C, DBQ390, DBQ400, DBD411, DBQ421,
  197. DBD072D, DBQ197, DBD222A, DBD222B, DBD222C, DBD222D, DBQ235A, DBQ235B, DBQ235C, DBQ301, DBQ330, DBQ360, DBQ370, DBD381)
  198. }
  199. }else{
  200. DBQ <- DBQ %>% select(SEQN, DBQ010, DBD030, DBD041, DBD050, DBD061, DBQ073A, DBQ073B, DBQ073C, DBQ390, DBQ400, DBD411, DBQ421,
  201. DBQ073D, DBQ197, DBQ223A, DBQ223B, DBQ223C, DBQ223D, DBQ235A, DBQ235B, DBQ235C, DBQ301, DBQ330, DBQ360, DBQ370, DBD381)
  202. }
  203. names(DBQ) <- c("SEQN", "DBQ010", "DBD030", "DBD041", "DBD050", "DBD061", "DBQ073A", "DBQ073B", "DBQ073C", "DBQ390", "DBQ400", "DBD411", "DBQ421",
  204. "DBQ073D", "DBQ197", "DBQ223A", "DBQ223B", "DBQ223C", "DBQ223D", "DBQ235A", "DBQ235B", "DBQ235C", "DBQ301", "DBQ330", "DBQ360", "DBQ370", "DBD381")
  205. data <- join_all(list(data, DBQ), by = "SEQN", type = "left")
  206. #体力活动
  207. PAQ <- read.xport(paste0(path, "PAQ",year,".XPT"))
  208. PAQ[is.na(PAQ)] <- 0
  209. if(year == ""){
  210. PAQ_s <- PAQ %>% select(SEQN, PAD020, PAQ100, PAQ180, PAD440, PAQ480, PAQ560, PAD570, PAQ580)
  211. PAQ_s$PAD020 <- ifelse(PAQ_s$PAD020 == 1, 4, 0)
  212. PAQ_s$PAQ100 <- ifelse(PAQ_s$PAQ100 == 1, 4.5, 0)
  213. PAQ_s$PAQ180 <- ifelse(PAQ_s$PAQ180 == 1, 1.4, ifelse(PAQ_s$PAQ180 == 2, 1.5, ifelse(PAQ_s$PAQ180 == 3, 1.6, ifelse(PAQ_s$PAQ180 == 4, 1.8, 0))))
  214. PAQ_s$PAD440 <- ifelse(PAQ_s$PAD440 == 1, 4, 0)
  215. PAQ_s$PAQ480 <- ifelse(PAQ_s$PAQ480 < 6, 1.2 * PAQ_s$PAQ480, 0)
  216. PAQ_s$PAQ560 <- ifelse(PAQ_s$PAQ560 < 78, 7 * PAQ_s$PAQ560, 0)
  217. PAQ_s$PAD570 <- ifelse(PAQ_s$PAD570 < 6, PAQ_s$PAD570, 0)
  218. PAQ_s$PAQ580 <- ifelse(PAQ_s$PAQ580 < 6, 1.5 * PAQ_s$PAQ580, 0)
  219. PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
  220. }else if(year =="_B"){
  221. PAQ_s <- PAQ %>% select(SEQN, PAD020, PAQ100, PAQ180, PAD440, PAQ560, PAD590, PAD600)
  222. PAQ_s$PAD020 <- ifelse(PAQ_s$PAD020 == 1, 4, 0)
  223. PAQ_s$PAQ100 <- ifelse(PAQ_s$PAQ100 == 1, 4.5, 0)
  224. PAQ_s$PAQ180 <- ifelse(PAQ_s$PAQ180 == 1, 1.4, ifelse(PAQ_s$PAQ180 == 2, 1.5, ifelse(PAQ_s$PAQ180 == 3, 1.6, ifelse(PAQ_s$PAQ180 == 4, 1.8, 0))))
  225. PAQ_s$PAD440 <- ifelse(PAQ_s$PAD440 == 1, 4, 0)
  226. PAQ_s$PAQ560 <- ifelse(PAQ_s$PAQ560 < 78, 7 * PAQ_s$PAQ560, 0)
  227. PAQ_s$PAD590 <- ifelse(PAQ_s$PAD590 < 6, PAQ_s$PAD590, 0)
  228. PAQ_s$PAD600 <- ifelse(PAQ_s$PAD600 < 6, 1.5 * PAQ_s$PAD600, 0)
  229. PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
  230. }else if(year =="_C" || year =="_D"){
  231. PAQ_s <- PAQ %>% select(SEQN, PAD020, PAQ100, PAQ180, PAD440, PAQ560, PAD590, PAD600)
  232. PAQ_s$PAD020 <- ifelse(PAQ_s$PAD020 == 1, 4, 0)
  233. PAQ_s$PAQ100 <- ifelse(PAQ_s$PAQ100 == 1, 4.5, 0)
  234. PAQ_s$PAQ180 <- ifelse(PAQ_s$PAQ180 == 1, 1.4, ifelse(PAQ_s$PAQ180 == 2, 1.5, ifelse(PAQ_s$PAQ180 == 3, 1.6, ifelse(PAQ_s$PAQ180 == 4, 1.8, 0))))
  235. PAQ_s$PAD440 <- ifelse(PAQ_s$PAD440 == 1, 4, 0)
  236. PAQ_s$PAQ560 <- ifelse(PAQ_s$PAQ560 < 78, 7 * PAQ_s$PAQ560, 0)
  237. PAQ_s$PAD590 <- ifelse(PAQ_s$PAD590 < 6, PAQ_s$PAD590, 0)
  238. PAQ_s$PAD600 <- ifelse(PAQ_s$PAD600 < 6, 1.5 * PAQ_s$PAD600, 0)
  239. PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
  240. }else{
  241. PAQ_s <- PAQ %>% select(SEQN, PAD615, PAD630, PAD645, PAD660, PAD675)
  242. PAQ_s$PAD615 <- ifelse(PAQ_s$PAD615 < 841, 8 * PAQ_s$PAD615, 0)
  243. PAQ_s$PAD630 <- ifelse(PAQ_s$PAD630 < 841, 4 * PAQ_s$PAD630, 0)
  244. PAQ_s$PAD645 <- ifelse(PAQ_s$PAD645 < 661, 4 * PAQ_s$PAD645, 0)
  245. PAQ_s$PAD660 <- ifelse(PAQ_s$PAD660 < 481, 8 * PAQ_s$PAD660, 0)
  246. PAQ_s$PAD675 <- ifelse(PAQ_s$PAD675 < 541, 4 * PAQ_s$PAD675, 0)
  247. PAQ$SCORE <- apply(PAQ_s[, -1], 1, function(x) sum(x))
  248. PAQ$PAAQUEX <- NA
  249. }
  250. PAQ <- PAQ %>% select(SEQN, SCORE, PAAQUEX)
  251. data <- join_all(list(data, PAQ), by = "SEQN", type = "left")
  252. #睡眠问题
  253. if(year != "" && year != "_B" && year != "_C"){
  254. SLQ <- read.xport(paste0(path, "SLQ",year,".XPT"))
  255. SLQ <- SLQ %>% select(SEQN, SLQ050)
  256. data <- join_all(list(data, SLQ), by = "SEQN", type = "left")
  257. }else{
  258. data$SLQ050 <- NA
  259. }
  260. #心理健康
  261. if (year =="" || year == "_B" || year == "_C"){
  262. #心理健康 - 抑郁症:抑郁评分
  263. if(year==""){
  264. CIQMDEP <- read.xport(paste0(path, "CIQMDEP",year,".XPT"))
  265. }else if (year == "_B" | year == "_C"){
  266. CIQMDEP <- read.xport(paste0(path, "CIQDEP",year,".XPT"))
  267. }
  268. CIQMDEP <- CIQMDEP %>% select(SEQN, CIDDSCOR)
  269. data <- join_all(list(data, CIQMDEP), by = "SEQN", type = "left")
  270. # #精神健康 - 广泛性焦虑症:焦虑评分
  271. # CIQGAD <- read.xport(paste0(path, "CIQGAD",year,".XPT"))
  272. # CIQGAD <- CIQGAD %>% select(SEQN, CIDGSCOR)
  273. # data <- join_all(list(data, CIQGAD), by = "SEQN", type = "left")
  274. # #心理健康 - 恐慌症:恐慌评分
  275. # if(year==""){
  276. # CIQPANIC <- read.xport(paste0(path, "CIQPANIC",year,".XPT"))
  277. # }else{
  278. # CIQPANIC <- read.xport(paste0(path, "CIQPAN",year,".XPT"))
  279. # }
  280. # CIQPANIC <- CIQPANIC %>% select(SEQN, CIDPSCOR)
  281. # data <- join_all(list(data, CIQPANIC), by = "SEQN", type = "left")
  282. }else{
  283. DPQ <- read.xport(paste0(path, "DPQ",year,".XPT"))
  284. DPQ <- DPQ[, -which(names(DPQ) == "DPQ100")]
  285. DPQ <- na.omit(DPQ)
  286. DPQ$CIDDSCOR <- apply(DPQ[, -1], 1, function(x) sum(x[x < 7]))
  287. DPQ <- DPQ %>% select(SEQN, CIDDSCOR)
  288. data <- join_all(list(data, DPQ), by = "SEQN", type = "left")
  289. }
  290. #社会支持
  291. if(year =="" || year == "_B" || year == "_C"){
  292. SSQ <- read.xport(paste0(path, "SSQ",year,".XPT"))
  293. }else if(year =="_D" || year == "_E"){
  294. SSQ <- read.xport(paste0(path, "SSQ",year,".XPT"))
  295. SSQ <- SSQ[, !(names(SSQ) == "SSD044")]
  296. }else{
  297. SSQ[c("SEQN", "SSQ011", "SSQ021A", "SSQ021B", "SSQ021C", "SSQ021D", "SSQ021E", "SSQ021F", "SSQ021G", "SSQ021H", "SSQ021I", "SSQ021J", "SSQ021K",
  298. "SSQ021L", "SSQ021M", "SSQ021N", "SSQ031", "SSQ041", "SSQ051", "SSQ061")] <- NA
  299. }
  300. names(SSQ) <- c("SEQN", "SSQ011", "SSQ021A", "SSQ021B", "SSQ021C", "SSQ021D", "SSQ021E", "SSQ021F", "SSQ021G", "SSQ021H", "SSQ021I", "SSQ021J", "SSQ021K",
  301. "SSQ021L", "SSQ021M", "SSQ021N", "SSQ031", "SSQ041", "SSQ051", "SSQ061")
  302. data <- join_all(list(data, SSQ), by = "SEQN", type = "left")
  303. #疾病诊断记录(有高血压、冠心病、心绞痛、心力衰竭、心脏病发作)
  304. MCQ <- read.xport(paste0(path, "MCQ",year,".XPT"))
  305. if(year =="" || year == "_B" || year == "_C" || year =="_D" || year == "_E" || year == "_F"){
  306. MCQ <- MCQ %>% select(SEQN, MCQ160B, MCQ180B, MCQ160C, MCQ180C, MCQ160D, MCQ180D, MCQ160E, MCQ180E)
  307. }else if(year =="_G" || year == "_H" || year == "_I" ){
  308. MCQ <- MCQ %>% select(SEQN, MCQ160B, MCQ180B, MCQ160C, MCQ180C, MCQ160D, MCQ180D, MCQ160E, MCQ180E)
  309. }else{
  310. MCQ <- MCQ %>% select(SEQN, MCQ160B, MCD180B, MCQ160C, MCD180C, MCQ160D, MCD180D, MCQ160E, MCD180E)
  311. }
  312. names(MCQ) <- c("SEQN", "MCQ160B", "MCD180B", "MCQ160C", "MCD180C", "MCQ160D", "MCD180D", "MCQ160E", "MCD180E")
  313. data <- join_all(list(data, MCQ), by = "SEQN", type = "left")
  314. #用药记录,天数、用量
  315. #心内常用药
  316. medicine = c("HYDROCHLOROTHIAZIDE", "VALSARTAN", "AMIODARONE", "AMLODIPINE", "ATORVASTATIN",
  317. "BENAZEPRIL", "OLMESARTAN", "PERINDOPRIL","TELMISARTAN","BENAZEPRIL","BENDROFLUMETHIAZIDE",
  318. "BEZAFIBRATE","BISOPROLOL","Captopril","Carvedilol", "EPINEPHRINE", "QUINIDINE", "Digoxin",
  319. "Diltiazem", "Enalapril", "Eprosartan", "ERTUGLIFLOZIN", "SITAGLIPTIN", "EZETIMIBE", "SIMVASTATIN",
  320. "FENOFIBRIC ACID", "Furosemide", "GLICLAZIDE", "GLIPIZIDE", "METFORMIN", "HYDRALAZINE", "ISOSORBIDE DINITRATE",
  321. "IRBESARTAN", "METOPROLOL", "PROPRANOLOL", "ISOSORBIDE MONONITRATE", "NIFEDIPINE", "NITROGLYCERIN",
  322. "PITAVASTATIN", "PRAVASTATIN", "PRAZOSIN", "PROCAINAMIDE", "PROPAFENONE", "SACUBITRIL", "SOTALOL", "SPIRONOLACTONE",
  323. "TICAGRELOR", "TORSEMIDEMIDE", "VERAPAMIL", "TRIAMTERENE", "WARFARIN")
  324. RXQ_RX <- read.xport(paste0(path, "RXQ_RX",year,".XPT"))
  325. if(year == "" || year == "_B"){
  326. day_RXQ_RX <- RXQ_RX %>%
  327. select(SEQN, RXD240B, RXD260) %>%
  328. na.omit(.) %>%
  329. pivot_wider(names_from = RXD240B, values_from = RXD260,
  330. names_prefix = "day_",values_fill = 0, values_fn = list(RXD260 = sum))
  331. number_RXQ_RX <- RXQ_RX %>%
  332. select(SEQN, RXD240B, RXD295) %>%
  333. na.omit(.) %>%
  334. pivot_wider(names_from = RXD240B, values_from = RXD295,
  335. names_prefix = "number_",values_fill = 0, values_fn = list(RXD295 = sum))
  336. }else{
  337. day_RXQ_RX <- RXQ_RX %>%
  338. select(SEQN, RXDDRUG, RXDDAYS) %>%
  339. na.omit(.) %>%
  340. pivot_wider(names_from = RXDDRUG, values_from = RXDDAYS,
  341. names_prefix = "day_",values_fill = 0, values_fn = list(RXDDAYS = sum))
  342. number_RXQ_RX <- RXQ_RX %>%
  343. select(SEQN, RXDDRUG, RXDCOUNT) %>%
  344. na.omit(.) %>%
  345. pivot_wider(names_from = RXDDRUG, values_from = RXDCOUNT,
  346. names_prefix = "number_",values_fill = 0, values_fn = list(RXDCOUNT = sum))
  347. }
  348. for (item in medicine){
  349. med_columns <- grep(item, names(day_RXQ_RX), value = TRUE)
  350. tmp = day_RXQ_RX[, c("SEQN", med_columns)]
  351. tmp[[paste0("day_", item)]] <- apply(tmp[, -1], 1, function(x) sum(x))
  352. tmp <- tmp %>% select(SEQN, paste0("day_", item))
  353. data <- join_all(list(data, tmp), by = "SEQN", type = "left")
  354. med_columns <- grep(item, names(number_RXQ_RX), value = TRUE)
  355. tmp = number_RXQ_RX[, c("SEQN", med_columns)]
  356. tmp[[paste0("num_", item)]] <- apply(tmp[, -1], 1, function(x) sum(x))
  357. tmp <- tmp %>% select(SEQN, paste0("num_", item))
  358. data <- join_all(list(data, tmp), by = "SEQN", type = "left")
  359. }
  360. #就诊情况
  361. HUQ <- read.xport(paste0(path, "HUQ",year,".XPT"))
  362. if(year == "" || year == "_B" ||year =="_E" || year =="_C" || year =="_D" || year =="_F"|| year =="_G"){
  363. if(year == ""){
  364. HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ040, HUQ050, HUQ060, HUQ070, HUD080, HUQ090)
  365. }else if(year == "_B"){
  366. HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ040, HUQ050, HUQ060, HUD070, HUQ080, HUQ090)
  367. }else{
  368. HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ040, HUQ050, HUQ060, HUQ071, HUD080, HUQ090)
  369. }
  370. }else{
  371. HUQ <- HUQ %>% select(SEQN, HUQ010, HUQ020, HUQ030, HUQ041, HUQ051, HUQ061, HUQ071, HUD080, HUQ090)
  372. }
  373. names(HUQ) <- c("SEQN", "HUQ010", "HUQ020", "HUQ030", "HUQ041", "HUQ051", "HUQ061", "HUQ071", "HUD080", "HUQ090")
  374. data <- join_all(list(data, HUQ), by = "SEQN", type = "left")
  375. #健康状况 胃病、肠病、HIV病毒、流感等 最近献血
  376. HSQ <- read.xport(paste0(path, "HSQ",year,".XPT"))
  377. if(year == ""){
  378. HSQ <- HSQ %>% select(SEQN, HSQ500, HSQ510, HSQ520, HSQ570, HSQ580, HSQ590, HSAQUEX)
  379. }else if(year == "_B"){
  380. HSQ <- HSQ %>% select(SEQN, HSQ500, HSQ510, HSQ520, HSD570, HSQ580, HSQ590, HSAQUEX)
  381. }else{
  382. HSQ <- HSQ %>% select(SEQN, HSQ500, HSQ510, HSQ520, HSQ571, HSQ580, HSQ590, HSAQUEX)
  383. }
  384. names(HSQ) <- c("SEQN", "HSQ500", "HSQ510", "HSQ520", "HSQ571", "HSQ580", "HSQ590", "HSAQUEX")
  385. data <- join_all(list(data, HSQ), by = "SEQN", type = "left")
  386. #身体机能
  387. PFQ <- read.xport(paste0(path, "PFQ",year,".XPT"))
  388. if(year == ""){
  389. PFQ <- PFQ %>% select(SEQN, PFQ020, PFQ030, PFQ040, PFQ048, PFQ050, PFQ055, PFQ056,
  390. PFQ059, PFQ060A, PFQ060B, PFQ060C,PFQ060D,PFQ060E,PFQ060F,PFQ060G,PFQ060H,PFQ060I,PFQ060J,PFQ060K,
  391. PFQ060L,PFQ060M,PFQ060N,PFQ060O,PFQ060P,PFQ060Q,PFQ060R,PFQ060S, PFD067A, PFD067B,PFD067C,PFD067D,
  392. PFD067E, PFQ090)
  393. }else if(year == "_B"){
  394. PFQ <- PFQ %>% select(SEQN, PFQ020, PFQ030, PFD040, PFQ048, PFQ050, PFQ055, PFQ056,
  395. PFQ059, PFQ060A, PFQ060B, PFQ060C,PFQ060D,PFQ060E,PFQ060F,PFQ060G,PFQ060H,PFQ060I,PFQ060J,PFQ060K,
  396. PFQ060L,PFQ060M,PFQ060N,PFQ060O,PFQ060P,PFQ060Q,PFQ060R,PFQ060S, PFD067A, PFD067B,PFD067C,PFD067D,
  397. PFD067E, PFQ090)
  398. }else{
  399. PFQ <- PFQ %>% select(SEQN, PFQ020, PFQ030, PFQ041, PFQ049, PFQ051, PFQ054, PFQ057,
  400. PFQ059, PFQ061A, PFQ061B,PFQ061C,PFQ061D,PFQ061E,PFQ061F,PFQ061G,PFQ061H,PFQ061I,PFQ061J,PFQ061K,
  401. PFQ061L,PFQ061M,PFQ061N,PFQ061O,PFQ061P,PFQ061Q, PFQ061R,PFQ061S, PFQ063A, PFQ063B,PFQ063C,PFQ063D,
  402. PFQ063E, PFQ090)
  403. }
  404. names(PFQ) <- c("SEQN", "PFQ020", "PFQ030", "PFQ041", "PFQ049", "PFQ051", "PFQ054", "PFQ057",
  405. "PFQ059", "PFQ061A", "PFQ061B","PFQ061C","PFQ061D","PFQ061E","PFQ061F","PFQ061G","PFQ061H","PFQ061I","PFQ061J","PFQ061K",
  406. "PFQ061L","PFQ061M","PFQ061N","PFQ061O","PFQ061P","PFQ061Q", "PFQ061R","PFQ061S", "PFQ063A", "PFQ063B","PFQ063C","PFQ063D",
  407. "PFQ063E", "PFQ090")
  408. data <- join_all(list(data, PFQ), by = "SEQN", type = "left")
  409. #认知功能
  410. if(year=="" | year=="_B" | year=="_H" | year=="_G"){
  411. CFQ <- read.xport(paste0(path, "CFQ",year,".XPT"))
  412. if(year == "" || year == "_B"){
  413. CFQ <- CFQ %>% select(SEQN, CFDRIGHT)
  414. }else{
  415. CFQ <- CFQ %>% select(SEQN, CFDDS)
  416. }
  417. names(CFQ) <- c("SEQN", "CFDDS")
  418. data <- join_all(list(data, CFQ), by = "SEQN", type = "left")
  419. }else{
  420. data$CFDDS <- NA
  421. }
  422. mortality_result = read.csv(file = paste0(path, "mortality_result.csv"))
  423. data <- join_all(list(data, mortality_result), by = "SEQN", type = "left")
  424. write.csv(data, file = paste0(path, "result", year, ".csv"), row.names = FALSE)
  425. csv_files <- list.files(path = "NHANES", pattern = "\\.csv$", recursive = TRUE, full.names = TRUE)
  426. df_combined <- NA
  427. # 确保读取文件的路径是完整的
  428. if (length(csv_files) > 0) {
  429. for (file in csv_files) {
  430. if (grepl("mortality", file)){
  431. next
  432. }
  433. # 读取每个.csv文件
  434. data <- read.csv(file, stringsAsFactors = FALSE)
  435. print(ncol(data))
  436. if (length(df_combined) == 0){
  437. df_combined <- data
  438. }else{
  439. df_combined <- rbind(data, df_combined)
  440. }
  441. # 这里可以添加代码进行数据处理
  442. print(paste("Read file:", file))
  443. }
  444. }
  445. write.csv(df_combined, file = paste0("NHANES/", "result_all", ".csv"), row.names = FALSE)
  446. # data = read.csv(file = paste0(path, "result.csv"))
  447. # head(data)
  448. #加入污染物
  449. library(ncdf4)
  450. library(raster)
  451. nc <- nc_open("CHAP_PM2.5_Y1K_2009_V4.nc")
  452. data <- ncvar_get(nc, "PM2.5")
  453. print(data)
  454. lon <- ncvar_get(nc, "lon")
  455. tori <- ncvar_get(nc, "time")
  456. tunites <- ncatt_get(nc, "time", )
  457. writeRaster(x = nc, filename = 'pre1.tif', format='GTiff', overwrite=TRUE)
  458. # China_Health_and_Retirement_Longitudinal_Study_CHARLS
  459. # demo_china <- read.dta("CHARLS/CHARLS2011_Dataset/demographic_background.dta")
  460. # household_income_china <- read.dta("CHARLS/CHARLS2011_Dataset/household_income.dta")
  461. # # UKDA
  462. # demo_UK <- read.dta("UKDA-5050-stata/stata/stata13_se/wave_0_1998_data.dta")