# HRS_preprocess.py (119 KB)
  1. import pandas as pd
  2. import math
  3. import numpy as np
  4. if __name__ == "__main__":
  5. # age
  6. # sex
  7. # marital status
  8. # education
  9. # smoking status
  10. # drinking status
  11. # physical activity level
  12. # body mass index (BMI)
  13. # glycated haemoglobin (HbA1c)
  14. # systolic blood pressure (SBP)
  15. # high-density lipoprotein cholesterol (HDL-C)
  16. # C-reactive protein
  17. # 获取1992数据
  18. with(open("/root/r_base/HRS/1992/data/health.da", "r", encoding="utf-8") )as file:
  19. HHID_list = []
  20. PN_list = []
  21. BIRTH_YEAR_list = []
  22. SEX_list = []
  23. MARITAL_STATUS_list = []
  24. EDUCATION_list = []
  25. SMOKED_list = []
  26. DRINKED_list = []
  27. PHYSICAL_ACTIVITY_LEVEL_list = []
  28. BMI_list = []
  29. HEART_PROBL_list = []
  30. STROKE_list=[]
  31. # 逐行读取文件
  32. for line in file:
  33. HHID = line[0:6]
  34. PN = line[6:9]
  35. BIRTH_YEAR = line[249:254]
  36. # 1.Male
  37. # 2.Female
  38. SEX = line[109:110]
  39. # 1. Married [Inap in V228-V238]
  40. # 2. Partner [Inap in V226-V227]
  41. # 3. Separated [Inap in V226-V234]
  42. # 4. Divorced [Inap in V226-V234]
  43. # 5. Widowed [Inap in V226-V234]
  44. # 6. Never married
  45. # 7. Married with 2 family residences--both
  46. # sampleable
  47. # 8. Married with 2 family residences--one
  48. # residence is not sampleable (institution
  49. # or out of the country)
  50. # 9. NA
  51. MARITAL_STATUS = line[302:303]
  52. # 1 Married or Partner; 5 other
  53. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="7" or MARITAL_STATUS=="8" else "5" if not MARITAL_STATUS=="9" else np.nan
  54. # 0 For no formal education
  55. # 1-11 .....Grades
  56. # 12 .......High school
  57. # 13-15 ....Some college
  58. # 16 .......College grad
  59. # 17 .......Post college (17+ years)
  60. # 97 .......Other
  61. EDUCATION = line[264:266]
  62. # 1. Yes
  63. # 5. No [Inap in V502-V505]
  64. SMOKED = line[519:520]
  65. # 1. Yes
  66. # 5. No [Inap in V507]
  67. DRINKED = line[527:528]
  68. # 3. vigorous (vigorous activity more than once a week)
  69. # 2. moderate (moderate activity more than once a week)
  70. # 1. inactive (the rest)
  71. # 重度活动
  72. VIGOROUS_PHYSICAL = line[534:535]
  73. # 轻度活动
  74. LIGHT_PHYSICAL = line[533:534]
  75. PHYSICAL_ACTIVITY_LEVEL = 3 if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL =="2" else 2 if LIGHT_PHYSICAL=="1" or LIGHT_PHYSICAL=="2" else 1
  76. # 体重
  77. WEIGH= float(line[536:539])*0.45359237
  78. # 身高
  79. HIGHT = float(line[542:543])*0.3048 + float(line[543:545])*0.0254
  80. # BMI
  81. BMI = WEIGH / math.pow(HIGHT,2)
  82. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  83. # 1. Yes
  84. # 5. No [Inap in V418-V421]
  85. HEART_PROBL = line[459:460]
  86. # STROKE
  87. STROKE = line[473:474]
  88. HHID_list.append(HHID)
  89. PN_list.append(PN)
  90. BIRTH_YEAR_list.append(BIRTH_YEAR)
  91. SEX_list.append(SEX)
  92. MARITAL_STATUS_list.append(MARITAL_STATUS)
  93. EDUCATION_list.append(EDUCATION)
  94. SMOKED_list.append(SMOKED)
  95. DRINKED_list.append(DRINKED)
  96. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  97. BMI_list.append(BMI)
  98. HEART_PROBL_list.append(HEART_PROBL)
  99. STROKE_list.append(STROKE)
  100. data = {
  101. "HHID":HHID_list,
  102. "PN":PN_list,
  103. "BIRTH_YEAR":BIRTH_YEAR_list,
  104. "SEX":SEX_list,
  105. "MARITAL_STATUS":MARITAL_STATUS_list,
  106. "EDUCATION":EDUCATION_list,
  107. "SMOKED":SMOKED_list,
  108. "DRINKED":DRINKED_list,
  109. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
  110. "BMI":BMI_list,
  111. "HEART_PROBL":HEART_PROBL_list,
  112. "STROKE":STROKE_list
  113. }
  114. data["WAVE"] = 1992
  115. result = pd.DataFrame(data)
  116. # 获取1993数据
  117. with(open("/root/r_base/HRS/1993/data/BR21.DA", "r", encoding="utf-8") )as file:
  118. HHID_list = []
  119. PN_list = []
  120. BIRTH_YEAR_list = []
  121. SEX_list = []
  122. MARITAL_STATUS_list = []
  123. EDUCATION_list = []
  124. SMOKED_list = []
  125. DRINKED_list = []
  126. PHYSICAL_ACTIVITY_LEVEL_list = []
  127. BMI_list = []
  128. HEART_PROBL_list = []
  129. STROKE_list=[]
  130. # 逐行读取文件
  131. for line in file:
  132. HHID = line[0:6]
  133. PN = line[6:9]
  134. BIRTH_YEAR = line[61:65]
  135. # 1.Male
  136. # 2.Female
  137. SEX = line[16:17]
  138. # MARRIED, SPOUSE PRESENT........... 1
  139. # MARRIED, SPOUSE ABSENT............ 2
  140. # LIVING WITH SOMEONE............... 3 GO TO A11b
  141. # DIVORCED/SEPARATED................ 4 GO TO A11g
  142. # WIDOWED........................... 5 GO TO A11g
  143. # NEVER MARRIED..................... 6 GO TO B1
  144. MARITAL_STATUS = line[98:99]
  145. # 1 Married or Partner; 5 other
  146. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="3" else "5"
  147. # 0 For no formal education
  148. # 1-11 .....Grades
  149. # 12 .......High school
  150. # 13-15 ....Some college
  151. # 16 .......College grad
  152. # 17 .......Post college (17+ years)
  153. # 97 .......Other
  154. EDUCATION = line[74:76]
  155. #CURRENT SMOKER..................... 1
  156. # FORMER SMOKER...................... 2 GO TO B20
  157. # NEVER SMOKED....................... 3 GO TO B20
  158. # 1. Yes
  159. # 5. No [Inap in V502-V505]
  160. SMOKED = "1" if line[172:173] == "1" or line[172:173] == "2" else "5"
  161. # 1. Yes
  162. # 5. No [Inap in V507]
  163. DRINKED = line[176:177]
  164. # 3. vigorous (vigorous activity more than once a week)
  165. # 2. moderate (moderate activity more than once a week)
  166. # 1. inactive (the rest)
  167. # 重度活动
  168. VIGOROUS_PHYSICAL = np.nan
  169. # 轻度活动
  170. LIGHT_PHYSICAL = np.nan
  171. PHYSICAL_ACTIVITY_LEVEL = np.nan
  172. # 体重
  173. WEIGH= float(line[179:182])*0.45359237
  174. # 身高
  175. HEIGHT = float(line[182:184])*0.0254
  176. # BMI
  177. BMI = WEIGH / math.pow(HEIGHT,2)
  178. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  179. # 1. Yes
  180. # 5. No [Inap in V418-V421]
  181. HEART_PROBL = line[139:140]
  182. # STROKE
  183. STROKE = line[142:143]
  184. HHID_list.append(HHID)
  185. PN_list.append(PN)
  186. BIRTH_YEAR_list.append(BIRTH_YEAR)
  187. SEX_list.append(SEX)
  188. MARITAL_STATUS_list.append(MARITAL_STATUS)
  189. EDUCATION_list.append(EDUCATION)
  190. SMOKED_list.append(SMOKED)
  191. DRINKED_list.append(DRINKED)
  192. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  193. BMI_list.append(BMI)
  194. HEART_PROBL_list.append(HEART_PROBL)
  195. STROKE_list.append(STROKE)
  196. data = {
  197. "HHID":HHID_list,
  198. "PN":PN_list,
  199. "BIRTH_YEAR":BIRTH_YEAR_list,
  200. "SEX":SEX_list,
  201. "MARITAL_STATUS":MARITAL_STATUS_list,
  202. "EDUCATION":EDUCATION_list,
  203. "SMOKED":SMOKED_list,
  204. "DRINKED":DRINKED_list,
  205. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
  206. "BMI":BMI_list,
  207. "HEART_PROBL":HEART_PROBL_list,
  208. "STROKE":STROKE_list
  209. }
  210. data["WAVE"] = 1993
  211. result_1993 = pd.DataFrame(data)
  212. result = pd.concat([result, result_1993], axis=0)
  213. # 获取1994数据
  214. with(open("/root/r_base/HRS/1994/data/W2a.da", "r", encoding="utf-8") )as file:
  215. HHID_list = []
  216. PN_list = []
  217. BIRTH_YEAR_list = []
  218. SEX_list = []
  219. MARITAL_STATUS_list = []
  220. EDUCATION_list = []
  221. # 逐行读取文件
  222. for line in file:
  223. HHID = line[0:6]
  224. PN = line[6:9]
  225. BIRTH_YEAR = line[26:30]
  226. # 1.Male
  227. # 2.Female
  228. SEX = line[22:23]
  229. # 1. Married [Inap in V228-V238]
  230. # 2. Partner [Inap in V226-V227]
  231. # 3. Separated [Inap in V226-V234]
  232. # 4. Divorced [Inap in V226-V234]
  233. # 5. Widowed [Inap in V226-V234]
  234. # 6. Never married
  235. # 7. Married (Not Institutionalized/not out of country)
  236. # 8. Married (Institutionalized/out of country)
  237. # 9. NA
  238. MARITAL_STATUS = line[55:57]
  239. # 1 Married or Partner; 5 other
  240. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="7" or MARITAL_STATUS=="8" else "5" if not MARITAL_STATUS=="9" else np.nan
  241. # 0 For no formal education
  242. # 1-11 .....Grades
  243. # 12 .......High school
  244. # 13-15 ....Some college
  245. # 16 .......College grad
  246. # 17 .......Post college (17+ years)
  247. # 97 .......Other
  248. # 98. Don't Know; DK
  249. # 99. Not Ascertained; NA
  250. EDUCATION = line[112:115]
  251. HHID_list.append(HHID)
  252. PN_list.append(PN)
  253. BIRTH_YEAR_list.append(BIRTH_YEAR)
  254. SEX_list.append(SEX)
  255. MARITAL_STATUS_list.append(MARITAL_STATUS)
  256. EDUCATION_list.append(EDUCATION)
  257. data = {
  258. "HHID":HHID_list,
  259. "PN":PN_list,
  260. "BIRTH_YEAR":BIRTH_YEAR_list,
  261. "SEX":SEX_list,
  262. "MARITAL_STATUS":MARITAL_STATUS_list,
  263. "EDUCATION":EDUCATION_list,
  264. }
  265. data["WAVE"] = 1994
  266. result_1994_one = pd.DataFrame(data)
  267. with(open("/root/r_base/HRS/1994/data/W2B.DA", "r", encoding="utf-8") )as file:
  268. HHID_list = []
  269. PN_list = []
  270. SMOKED_list = []
  271. DRINKED_list = []
  272. PHYSICAL_ACTIVITY_LEVEL_list = []
  273. BMI_list = []
  274. HEART_PROBL_list = []
  275. STROKE_list=[]
  276. # 逐行读取文件
  277. for line in file:
  278. HHID = line[0:6]
  279. PN = line[6:9]
  280. #1. Yes
  281. #5. No [GO TO B41]
  282. #8. Don't Know; DK [GO TO B41]
  283. #9. Refused; RF [GO TO B41]
  284. #0. Inap.
  285. # Proxy interview for deceased Wave-1 R
  286. SMOKED = line[356:358]
  287. #1. Yes
  288. #5. No [GO TO B41]
  289. #8. Don't Know; DK [GO TO B41]
  290. #9. Refused; RF [GO TO B41]
  291. #0. Inap.
  292. # Proxy interview for deceased Wave-1 R
  293. DRINKED = line[367:369]
  294. # 3. vigorous (vigorous activity more than once a week)
  295. # 2. moderate (moderate activity more than once a week)
  296. # 1. inactive (the rest)
  297. # 活动单位
  298. # 02. Week
  299. # 04. Month
  300. # 06. Year
  301. # 07. Other (specify)
  302. # 11. Day
  303. # 98. Don't Know/Not Ascertained; DK/NA
  304. # 99. Refused; RF
  305. # 00. Inap.
  306. # Proxy interview for deceased Wave-1 R
  307. # [B42a: or B42=995-999]
  308. # [B43a: or B43=995-999]
  309. # 重度活动
  310. VIGOROUS_PHYSICAL = line[378:382]
  311. VIGOROUS_UNIT = line[382:385]
  312. VIGOROUS_PHYSICAL_FLAG = np.nan
  313. if VIGOROUS_UNIT == "02" and float(VIGOROUS_PHYSICAL)>0 and not VIGOROUS_PHYSICAL=="994" and not float(VIGOROUS_PHYSICAL)>995:
  314. VIGOROUS_PHYSICAL_FLAG = True
  315. if VIGOROUS_UNIT == "04" and float(VIGOROUS_PHYSICAL)>3 and not VIGOROUS_PHYSICAL=="994" and not float(VIGOROUS_PHYSICAL)>995:
  316. VIGOROUS_PHYSICAL_FLAG = True
  317. if VIGOROUS_UNIT == "06" and float(VIGOROUS_PHYSICAL)>51 and not VIGOROUS_PHYSICAL=="994" and not float(VIGOROUS_PHYSICAL)>995:
  318. VIGOROUS_PHYSICAL_FLAG = True
  319. if VIGOROUS_PHYSICAL=="00":
  320. VIGOROUS_PHYSICAL_FLAG = False
  321. # 轻度活动
  322. LIGHT_PHYSICAL = line[371:375]
  323. LIGHT_UNIT = line[375:378]
  324. # 判断是否符合轻运动 1符合;0不符合
  325. LIGHT_PHYSICAL_FLAG = np.nan
  326. if LIGHT_UNIT == "02" and float(LIGHT_PHYSICAL)>0 and not LIGHT_PHYSICAL=="994" and not float(LIGHT_PHYSICAL)>995:
  327. LIGHT_PHYSICAL_FLAG = True
  328. if LIGHT_UNIT == "04" and float(LIGHT_PHYSICAL)>3 and not LIGHT_PHYSICAL=="994" and not float(LIGHT_PHYSICAL)>995:
  329. LIGHT_PHYSICAL_FLAG = True
  330. if LIGHT_UNIT == "06" and float(LIGHT_PHYSICAL)>51 and not LIGHT_PHYSICAL=="994" and not float(LIGHT_PHYSICAL)>995:
  331. LIGHT_PHYSICAL_FLAG = True
  332. if LIGHT_PHYSICAL=="00":
  333. LIGHT_PHYSICAL_FLAG = False
  334. PHYSICAL_ACTIVITY_LEVEL = 3 if VIGOROUS_PHYSICAL_FLAG == True else 2 if LIGHT_PHYSICAL_FLAG==True else 1 if LIGHT_PHYSICAL_FLAG==False or VIGOROUS_PHYSICAL_FLAG==False else np.nan
  335. # 体重
  336. WEIGH= float(line[385:389])*0.45359237 if not float(line[385:389])>500 else np.nan
  337. # 身高
  338. HIGHT = float(line[389:392])*0.3048 + float(line[392:395])*0.0254 if not float(line[389:392])>95 and not float(line[392:395])>95 else np.nan
  339. # BMI
  340. BMI = WEIGH / math.pow(HIGHT,2) if not np.isnan(WEIGH) and not np.isnan(HIGHT) else np.nan
  341. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  342. # 1. Yes
  343. # 5. No [Inap in V418-V421]
  344. HEART_PROBL = line[147:149] if not line[147:149]=="8" and not line[147:149]=="9" and not line[147:149]=="0" else np.nan
  345. # STROKE
  346. STROKE = line[173:175] if not line[173:175]=="8" and not line[173:175]=="9" and not line[173:175]=="0" else np.nan
  347. HHID_list.append(HHID)
  348. PN_list.append(PN)
  349. SMOKED_list.append(SMOKED)
  350. DRINKED_list.append(DRINKED)
  351. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  352. BMI_list.append(BMI)
  353. HEART_PROBL_list.append(HEART_PROBL)
  354. STROKE_list.append(STROKE)
  355. data = {
  356. "HHID":HHID_list,
  357. "PN":PN_list,
  358. "SMOKED":SMOKED_list,
  359. "DRINKED":DRINKED_list,
  360. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
  361. "BMI":BMI_list,
  362. "HEART_PROBL":HEART_PROBL_list,
  363. "STROKE":STROKE_list
  364. }
  365. result_1994_two = pd.DataFrame(data)
  366. result_1994 = pd.merge(result_1994_one, result_1994_two, on=["HHID", "PN"], how="left")
  367. result = pd.concat([result, result_1994], axis=0)
  368. print(result.head())
  369. # 获取1995数据
  370. with(open("/root/r_base/HRS/1995/data/A95A_R.DA", "r", encoding="utf-8") )as file:
  371. HHID_list = []
  372. PN_list = []
  373. BIRTH_YEAR_list = []
  374. MARITAL_STATUS_list = []
  375. EDUCATION_list = []
  376. # 逐行读取文件
  377. for line in file:
  378. HHID = line[0:6]
  379. PN = line[6:9]
  380. BIRTH_YEAR = line[30:34]
  381. # MARRIED, SPOUSE PRESENT........... 1
  382. # MARRIED, SPOUSE ABSENT............ 2
  383. # LIVING WITH SOMEONE............... 3 GO TO A11b
  384. # DIVORCED/SEPARATED................ 4 GO TO A11g
  385. # WIDOWED........................... 5 GO TO A11g
  386. # NEVER MARRIED..................... 6 GO TO B1
  387. # 0. Exit proxy was taken before the interview with the surviving spouse.
  388. # 7. MARRIED, SPOUSE ABSENT (NOT INSTITUTION)
  389. MARITAL_STATUS = line[76:77]
  390. # 1 Married or Partner; 5 other
  391. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="3" else np.nan if MARITAL_STATUS=="0" or MARITAL_STATUS=="7" else "5"
  392. # 0 For no formal education
  393. # 1-11 .....Grades
  394. # 12 .......High school
  395. # 13-15 ....Some college
  396. # 16 .......College grad
  397. # 17 .......Post college (17+ years)
  398. # 97 .......Other
  399. # 98. DK (don't know); NA (not ascertained)
  400. # 99. RF (refused)
  401. EDUCATION = np.nan if line[49:51] == "97" or line[49:51] == "98" or line[49:51] == "99" else line[49:51]
  402. HHID_list.append(HHID)
  403. PN_list.append(PN)
  404. BIRTH_YEAR_list.append(BIRTH_YEAR)
  405. MARITAL_STATUS_list.append(MARITAL_STATUS)
  406. EDUCATION_list.append(EDUCATION)
  407. data = {
  408. "HHID":HHID_list,
  409. "PN":PN_list,
  410. "BIRTH_YEAR":BIRTH_YEAR_list,
  411. "MARITAL_STATUS":MARITAL_STATUS_list,
  412. "EDUCATION":EDUCATION_list,
  413. }
  414. data["WAVE"] = 1995
  415. result_1995_one = pd.DataFrame(data)
  416. with(open("/root/r_base/HRS/1995/data/A95CS_R.DA", "r", encoding="utf-8") )as file:
  417. HHID_list = []
  418. PN_list = []
  419. SEX_list = []
  420. # 逐行读取文件
  421. for line in file:
  422. HHID = line[0:6]
  423. PN = line[6:9]
  424. # 1.Male
  425. # 2.Female
  426. SEX = line[36:37] if not line[36:37]=="0" else np.nan
  427. HHID_list.append(HHID)
  428. PN_list.append(PN)
  429. SEX_list.append(SEX)
  430. data = {
  431. "HHID":HHID_list,
  432. "PN":PN_list,
  433. "SEX":SEX_list,
  434. }
  435. result_1995_two = pd.DataFrame(data)
  436. result_1995 = pd.merge(result_1995_one, result_1995_two, on=["HHID", "PN"], how="left")
  437. with(open("/root/r_base/HRS/1995/data/A95B_R.DA", "r", encoding="utf-8") )as file:
  438. HHID_list = []
  439. PN_list = []
  440. SMOKED_list = []
  441. DRINKED_list = []
  442. PHYSICAL_ACTIVITY_LEVEL_list = []
  443. BMI_list = []
  444. HEART_PROBL_list = []
  445. STROKE_list=[]
  446. # 逐行读取文件
  447. for line in file:
  448. HHID = line[0:6]
  449. PN = line[6:9]
  450. # 1. YES
  451. # 2. Cigars
  452. # 5. NO
  453. # 7. Other
  454. # 8. DK (don't know); NA (not ascertained)
  455. # 9. RF (refused)
  456. # 合并后
  457. # 1. Yes
  458. # 5. No [Inap in V502-V505]
  459. SMOKED = "1" if line[153:154] == "1" or line[153:154] == "2" else "5" if line[153:154] == "5" else np.nan
  460. # 1. YES
  461. # 3. [VOL] NEVER HAVE USED ALCOHOL
  462. # 5. NO
  463. # 7. Other
  464. # 8. DK (don't know); NA (not ascertained)
  465. # 9. RF (refused)
  466. # 合并后
  467. # 1. Yes
  468. # 5. No [Inap in V502-V505]
  469. DRINKED = "1" if line[157:158] == "1" else "5" if line[157:158] == "5" or line[157:158] == "3" else np.nan
  470. # 3. vigorous (vigorous activity more than once a week)
  471. # 2. moderate (moderate activity more than once a week)
  472. # 1. inactive (the rest)
  473. # 重度活动
  474. VIGOROUS_PHYSICAL = np.nan
  475. # 轻度活动
  476. LIGHT_PHYSICAL = np.nan
  477. PHYSICAL_ACTIVITY_LEVEL = np.nan
  478. # 体重
  479. WEIGH= float(line[164:167])*0.45359237 if not float(line[164:167])>400 else np.nan
  480. # 身高
  481. HIGHT = line[168:169]
  482. if not line[168:169] == " " and not line[168:169] == "8" and not line[168:169] == "9" and not pd.isna(WEIGH):
  483. HIGHT = float(line[168:169])*0.3048 + float(line[169:171])*0.0254
  484. # BMI
  485. BMI = WEIGH / math.pow(HEIGHT,2)
  486. else :
  487. BMI = np.nan
  488. # 1. YES
  489. # 3. [VOL] DISPUTES W1 RECORD
  490. # 5. NO
  491. # 7. Other
  492. # 8. DK (don't know); NA (not ascertained)
  493. # 9. RF (refused)
  494. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  495. # 1. Yes
  496. # 3. [VOL] DISPUTES W1 RECORD
  497. # 5. No [Inap in V418-V421]
  498. HEART_PROBL = np.nan if line[63:64] == "8" else line[63:64]
  499. # STROKE
  500. # 1. Yes
  501. # 3. [VOL] DISPUTES W1 RECORD
  502. # 5. No [Inap in V418-V421]
  503. STROKE = np.nan if line[84:85] == "8" else "5" if line[84:85] == "2" or line[84:85] == "5" else line[84:85]
  504. HHID_list.append(HHID)
  505. PN_list.append(PN)
  506. SMOKED_list.append(SMOKED)
  507. DRINKED_list.append(DRINKED)
  508. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  509. BMI_list.append(BMI)
  510. HEART_PROBL_list.append(HEART_PROBL)
  511. STROKE_list.append(STROKE)
  512. data = {
  513. "HHID":HHID_list,
  514. "PN":PN_list,
  515. "SMOKED":SMOKED_list,
  516. "DRINKED":DRINKED_list,
  517. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
  518. "BMI":BMI_list,
  519. "HEART_PROBL":HEART_PROBL_list,
  520. "STROKE":STROKE_list
  521. }
  522. result_1995_three = pd.DataFrame(data)
  523. result_1995 = pd.merge(result_1995, result_1995_three, on=["HHID", "PN"], how="left")
  524. result = pd.concat([result, result_1995], axis=0)
  525. # 获取1996数据
  526. with(open("/root/r_base/HRS/1996/data/H96A_R.DA", "r", encoding="utf-8") )as file:
  527. HHID_list = []
  528. PN_list = []
  529. BIRTH_YEAR_list = []
  530. MARITAL_STATUS_list = []
  531. EDUCATION_list = []
  532. # 逐行读取文件
  533. for line in file:
  534. HHID = line[0:6]
  535. PN = line[6:9]
  536. BIRTH_YEAR = line[25:29]
  537. # MARRIED, SPOUSE PRESENT........... 1
  538. # MARRIED, SPOUSE ABSENT............ 2
  539. # LIVING WITH SOMEONE............... 3 GO TO A11b
  540. # DIVORCED/SEPARATED................ 4 GO TO A11g
  541. # WIDOWED........................... 5 GO TO A11g
  542. # NEVER MARRIED..................... 6 GO TO B1
  543. # 7. MARRIED, SPOUSE ABSENT (NOT INSTITUTION)
  544. MARITAL_STATUS = line[70:71]
  545. # 1 Married or Partner; 5 other
  546. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="3" else "5" if MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
  547. # 0 For no formal education
  548. # 1-11 .....Grades
  549. # 12 .......High school
  550. # 13-15 ....Some college
  551. # 16 .......College grad
  552. # 17 .......Post college (17+ years)
  553. # 97 .......Other
  554. # 98. DK (don't know); NA (not ascertained)
  555. # 99. RF (refused)
  556. EDUCATION = np.nan if line[43:45] == "97" or line[43:45] == "98" or line[43:45] == "99" else line[43:45]
  557. HHID_list.append(HHID)
  558. PN_list.append(PN)
  559. BIRTH_YEAR_list.append(BIRTH_YEAR)
  560. MARITAL_STATUS_list.append(MARITAL_STATUS)
  561. EDUCATION_list.append(EDUCATION)
  562. data = {
  563. "HHID":HHID_list,
  564. "PN":PN_list,
  565. "BIRTH_YEAR":BIRTH_YEAR_list,
  566. "MARITAL_STATUS":MARITAL_STATUS_list,
  567. "EDUCATION":EDUCATION_list,
  568. }
  569. data["WAVE"] = 1996
  570. result_1996_one = pd.DataFrame(data)
  571. with(open("/root/r_base/HRS/1996/data/H96CS_R.DA", "r", encoding="utf-8") )as file:
  572. HHID_list = []
  573. PN_list = []
  574. SEX_list = []
  575. # 逐行读取文件
  576. for line in file:
  577. HHID = line[0:6]
  578. PN = line[6:9]
  579. # 1.Male
  580. # 2.Female
  581. SEX = line[74:75]
  582. HHID_list.append(HHID)
  583. PN_list.append(PN)
  584. SEX_list.append(SEX)
  585. data = {
  586. "HHID":HHID_list,
  587. "PN":PN_list,
  588. "SEX":SEX_list,
  589. }
  590. result_1996_two = pd.DataFrame(data)
  591. result_1996 = pd.merge(result_1996_one, result_1996_two, on=["HHID", "PN"], how="left")
  # H96B_R.DA: 1996 smoking, drinking, BMI, heart-problem and stroke variables,
  # keyed by HHID + PN. The frame is left-merged into result_1996, which is then
  # appended to the running `result` frame.
  health_1996 = {
      "HHID": [],
      "PN": [],
      "SMOKED": [],
      "DRINKED": [],
      "PHYSICAL_ACTIVITY_LEVEL": [],
      "BMI": [],
      "HEART_PROBL": [],
      "STROKE": [],
  }
  with open("/root/r_base/HRS/1996/data/H96B_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          health_1996["HHID"].append(line[0:6])
          health_1996["PN"].append(line[6:9])

          # Raw smoking codes:
          # 1. YES
          # 2. Cigars
          # 3. PIPE (IF VOLUNTEERED)
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          smoked_code = line[160:161]
          if smoked_code in ("1", "2", "3"):
              smoked = "1"
          elif smoked_code == "5":
              smoked = "5"
          else:
              smoked = np.nan
          health_1996["SMOKED"].append(smoked)

          # Raw drinking codes:
          # 1. YES
          # 3. [VOL] NEVER HAVE USED ALCOHOL
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          drinked_code = line[166:167]
          if drinked_code == "1":
              drinked = "1"
          elif drinked_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          health_1996["DRINKED"].append(drinked)

          # Physical activity level is left missing for every row of this block.
          health_1996["PHYSICAL_ACTIVITY_LEVEL"].append(np.nan)

          # Weight: pounds -> kilograms. Blank values and values above 400 are
          # treated as missing (presumably the 998/999 codes -- confirm).
          weight_text = line[174:177].strip()
          if weight_text and float(weight_text) <= 400:
              weight_kg = float(weight_text) * 0.45359237
          else:
              weight_kg = np.nan

          # Height: feet + inches -> metres.
          # The original stored the height in HIGHT but computed BMI from HEIGHT,
          # which is either undefined or a stale value from an earlier section.
          feet_text = line[178:180].strip()
          inches_text = line[180:182].strip()
          bmi = np.nan
          if feet_text not in ("", "98", "99") and not pd.isna(weight_kg):
              height_m = float(feet_text) * 0.3048
              # Blank/98/99 inches fall back to feet only, matching how the
              # later waves in this file handle a missing inches value.
              if inches_text not in ("", "98", "99"):
                  height_m += float(inches_text) * 0.0254
              if height_m > 0:
                  bmi = weight_kg / (height_m ** 2)
          health_1996["BMI"].append(bmi)

          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure, or other heart problems). Raw codes:
          # 1. YES
          # 3. [VOL] DISPUTES W1 RECORD
          # 5. NO
          # 7. Other
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Blank, 8 and 9 become missing; other codes are kept as-is.
          heart_code = line[66:67].strip()
          health_1996["HEART_PROBL"].append(
              np.nan if heart_code in ("", "8", "9") else heart_code
          )

          # Stroke. Target coding:
          # 1. Yes
          # 3. [VOL] DISPUTES W1 RECORD
          # 5. No
          # Blank, 8 and 9 become missing; 2 is folded into 5 (No).
          stroke_code = line[87:88].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "5"):
              stroke = "5"
          else:
              stroke = stroke_code
          health_1996["STROKE"].append(stroke)

  result_1996_three = pd.DataFrame(health_1996)
  result_1996 = pd.merge(result_1996, result_1996_three, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_1996], axis=0)
  # Load the 1998 data.
  # H98A_R.DA: birth year, marital status and years of education, keyed by HHID + PN.
  demo_1998 = {
      "HHID": [],
      "PN": [],
      "BIRTH_YEAR": [],
      "MARITAL_STATUS": [],
      "EDUCATION": [],
  }
  with open("/root/r_base/HRS/1998/data/H98A_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          demo_1998["HHID"].append(line[0:6])
          demo_1998["PN"].append(line[6:9])

          # 9998 is treated as a missing birth year.
          birth_year = line[31:35]
          demo_1998["BIRTH_YEAR"].append(np.nan if birth_year == "9998" else birth_year)

          # Raw marital-status codes:
          # 0. DK (don't know); NA (not ascertained); RF (refused)
          # 1. MARRIED
          # 3. SEPARATED
          # 4. DIVORCED
          # 5. WIDOWED
          # 6. NEVER MARRIED
          # 7. OTHER (SPECIFY)
          # Recoded to: 1 = married or partner; 5 = other; anything else missing.
          marital_code = line[150:151]
          if marital_code == "1":
              marital = "1"
          elif marital_code in ("3", "4", "5", "6", "7"):
              marital = "5"
          else:
              marital = np.nan
          demo_1998["MARITAL_STATUS"].append(marital)

          # Education codes:
          # 0 ........No formal education
          # 1-11 .....Grades
          # 12 .......High school
          # 13-15 ....Some college
          # 16 .......College grad
          # 17 .......Post college (17+ years)
          # 97 .......Other
          # 98. DK (don't know); NA (not ascertained)
          # 99. RF (refused)
          # The field is read two characters wide, like every other wave in this
          # file. The previous one-character slice line[61:62] could never equal
          # "97"/"98"/"99" and cut two-digit values down to their first digit.
          # NOTE(review): start column 61 is kept from the original -- confirm
          # against the H98A_R record layout.
          education = line[61:63]
          demo_1998["EDUCATION"].append(
              np.nan if education in ("97", "98", "99") else education
          )

  demo_1998["WAVE"] = 1998
  result_1998_one = pd.DataFrame(demo_1998)
  # H98CS_R.DA: respondent sex for the 1998 wave, keyed by HHID + PN.
  sex_1998 = {"HHID": [], "PN": [], "SEX": []}
  with open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          sex_1998["HHID"].append(record[0:6])
          sex_1998["PN"].append(record[6:9])
          # 1. Male
          # 2. Female
          sex_1998["SEX"].append(record[70:71])
  result_1998_two = pd.DataFrame(sex_1998)
  # Left join: every row of result_1998_one is kept, SEX is attached where found.
  result_1998 = pd.merge(
      result_1998_one,
      result_1998_two,
      on=["HHID", "PN"],
      how="left",
  )
  # H98B_R.DA: 1998 smoking, drinking, BMI, heart-problem and stroke variables,
  # keyed by HHID + PN. The frame is left-merged into result_1998, which is then
  # appended to the running `result` frame.
  health_1998 = {
      "HHID": [],
      "PN": [],
      "SMOKED": [],
      "DRINKED": [],
      "PHYSICAL_ACTIVITY_LEVEL": [],
      "BMI": [],
      "HEART_PROBL": [],
      "STROKE": [],
  }
  with open("/root/r_base/HRS/1998/data/H98B_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          health_1998["HHID"].append(line[0:6])
          health_1998["PN"].append(line[6:9])

          # Smoking:
          # 1. YES
          # 5. NO
          # Any other value is missing.
          smoked_code = line[157:158]
          health_1998["SMOKED"].append(
              smoked_code if smoked_code in ("1", "5") else np.nan
          )

          # Raw drinking codes:
          # 1. YES
          # 3. [VOL] NEVER HAVE USED ALCOHOL
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          drinked_code = line[184:185]
          if drinked_code == "1":
              drinked = "1"
          elif drinked_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          health_1998["DRINKED"].append(drinked)

          # Physical activity level is left missing for every row of this block.
          health_1998["PHYSICAL_ACTIVITY_LEVEL"].append(np.nan)

          # Weight: pounds -> kilograms. A blank field used to raise ValueError
          # in float(); it is now missing, as are values above 400 (presumably
          # the 998/999 codes -- confirm).
          weight_text = line[196:199].strip()
          if weight_text and float(weight_text) <= 400:
              weight_kg = float(weight_text) * 0.45359237
          else:
              weight_kg = np.nan

          # Height: feet + inches -> metres. Blank/98/99 inches fall back to
          # feet only (the original only recognised "98").
          feet_text = line[200:202].strip()
          inches_text = line[202:204].strip()
          bmi = np.nan
          if feet_text not in ("", "98", "99") and not pd.isna(weight_kg):
              height_m = float(feet_text) * 0.3048
              if inches_text not in ("", "98", "99"):
                  height_m += float(inches_text) * 0.0254
              if height_m > 0:
                  bmi = weight_kg / (height_m ** 2)
          health_1998["BMI"].append(bmi)

          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure, or other heart problems). Raw codes:
          # 1. YES
          # 3. [VOL] DISPUTES PREVIOUS WAVE RECORD
          # 5. NO
          # 6. PRELOAD ERROR: Condition reported at prior wave but said no to
          #    new event
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Blank, 8 and 9 become missing; other codes are kept as-is.
          heart_code = line[66:67].strip()
          health_1998["HEART_PROBL"].append(
              np.nan if heart_code in ("", "8", "9") else heart_code
          )

          # Stroke. Raw codes:
          # 1. YES
          # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
          # 3. [VOL] DISPUTES PREVIOUS WAVE RECORD
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Blank, 8 and 9 become missing; 2 is folded into 5 (No).
          stroke_code = line[87:88].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "5"):
              stroke = "5"
          else:
              stroke = stroke_code
          health_1998["STROKE"].append(stroke)

  result_1998_three = pd.DataFrame(health_1998)
  result_1998 = pd.merge(result_1998, result_1998_three, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_1998], axis=0)
  # Load the 2000 data.
  # H00A_R.DA: birth year, marital status and years of education, keyed by HHID + PN.
  demo_2000 = {
      "HHID": [],
      "PN": [],
      "BIRTH_YEAR": [],
      "MARITAL_STATUS": [],
      "EDUCATION": [],
  }
  with open("/root/r_base/HRS/2000/data/H00A_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          demo_2000["HHID"].append(record[0:6])
          demo_2000["PN"].append(record[6:9])
          demo_2000["BIRTH_YEAR"].append(record[31:35])

          # Raw marital-status codes:
          # 0. DK (don't know); NA (not ascertained); RF (refused)
          # 1. MARRIED
          # 3. SEPARATED
          # 4. DIVORCED
          # 5. WIDOWED
          # 6. NEVER MARRIED
          # 7. OTHER (SPECIFY)
          # Recoded to: 1 = married or partner; 5 = other; anything else missing.
          marital_raw = record[152:153]
          if marital_raw == "1":
              marital = "1"
          elif marital_raw in {"3", "4", "5", "6", "7"}:
              marital = "5"
          else:
              marital = np.nan
          demo_2000["MARITAL_STATUS"].append(marital)

          # Education codes:
          # 0 ........No formal education
          # 1-11 .....Grades
          # 12 .......High school
          # 13-15 ....Some college
          # 16 .......College grad
          # 17 .......Post college (17+ years)
          # 97 .......Other
          # 98. DK (don't know); NA (not ascertained)
          # 99. RF (refused)
          schooling = record[63:65]
          if schooling in {"97", "98", "99"}:
              demo_2000["EDUCATION"].append(np.nan)
          else:
              demo_2000["EDUCATION"].append(schooling)

  demo_2000["WAVE"] = 2000
  result_2000_one = pd.DataFrame(demo_2000)
  # H00CS_R.DA: respondent sex for the 2000 wave, keyed by HHID + PN.
  sex_2000 = {"HHID": [], "PN": [], "SEX": []}
  with open("/root/r_base/HRS/2000/data/H00CS_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          sex_2000["HHID"].append(record[0:6])
          sex_2000["PN"].append(record[6:9])
          # 1. Male
          # 2. Female
          sex_2000["SEX"].append(record[79:80])
  result_2000_two = pd.DataFrame(sex_2000)
  # Left join: every row of result_2000_one is kept, SEX is attached where found.
  result_2000 = pd.merge(
      result_2000_one,
      result_2000_two,
      on=["HHID", "PN"],
      how="left",
  )
  # H00B_R.DA: 2000 smoking, drinking, BMI, heart-problem and stroke variables,
  # keyed by HHID + PN. The frame is left-merged into result_2000, which is then
  # appended to the running `result` frame.
  health_2000 = {
      "HHID": [],
      "PN": [],
      "SMOKED": [],
      "DRINKED": [],
      "PHYSICAL_ACTIVITY_LEVEL": [],
      "BMI": [],
      "HEART_PROBL": [],
      "STROKE": [],
  }
  with open("/root/r_base/HRS/2000/data/H00B_R.DA", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          health_2000["HHID"].append(line[0:6])
          health_2000["PN"].append(line[6:9])

          # Smoking:
          # 1. YES
          # 5. NO
          # Any other value is missing.
          smoked_code = line[154:155]
          health_2000["SMOKED"].append(
              smoked_code if smoked_code in ("1", "5") else np.nan
          )

          # Raw drinking codes:
          # 1. YES
          # 3. [VOL] NEVER HAVE USED ALCOHOL
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          drinked_code = line[181:182]
          if drinked_code == "1":
              drinked = "1"
          elif drinked_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          health_2000["DRINKED"].append(drinked)

          # Physical activity level is left missing for every row of this block.
          health_2000["PHYSICAL_ACTIVITY_LEVEL"].append(np.nan)

          # Weight: pounds -> kilograms. A blank field used to raise ValueError
          # in float(); it is now missing, as are values above 400 (presumably
          # the 998/999 codes -- confirm).
          weight_text = line[193:196].strip()
          if weight_text and float(weight_text) <= 400:
              weight_kg = float(weight_text) * 0.45359237
          else:
              weight_kg = np.nan

          # Height: one-digit feet + two-digit inches -> metres.
          # Feet codes 8 and 9 are both treated as missing (the waves in this
          # file disagree on which one is used; neither is a plausible height).
          # Blank/98/99 inches fall back to feet only instead of raising or
          # being added as real inches.
          feet_text = line[197:198].strip()
          inches_text = line[198:200].strip()
          bmi = np.nan
          if feet_text not in ("", "8", "9") and not pd.isna(weight_kg):
              height_m = float(feet_text) * 0.3048
              if inches_text not in ("", "98", "99"):
                  height_m += float(inches_text) * 0.0254
              if height_m > 0:
                  bmi = weight_kg / (height_m ** 2)
          health_2000["BMI"].append(bmi)

          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure, or other heart problems). Raw codes:
          # 1. YES
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. A blank field is missing; previously
          # it fell through to the catch-all and was coded "5".
          heart_code = line[65:66].strip()
          if heart_code in ("", "8", "9"):
              heart = np.nan
          elif heart_code in ("1", "3"):
              heart = "1"
          else:
              heart = "5"
          health_2000["HEART_PROBL"].append(heart)

          # Stroke. Raw codes:
          # 1. YES
          # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. A blank field is missing; previously
          # it fell through to the catch-all and was coded "1" (Yes).
          stroke_code = line[86:87].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          health_2000["STROKE"].append(stroke)

  result_2000_three = pd.DataFrame(health_2000)
  result_2000 = pd.merge(result_2000, result_2000_three, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_2000], axis=0)
  # Load the 2002 data.
  # H02PR_R.da: birth year and sex, keyed by HHID + PN.
  preload_2002 = {"HHID": [], "PN": [], "BIRTH_YEAR": [], "SEX": []}
  with open("/root/r_base/HRS/2002/data/H02PR_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          preload_2002["HHID"].append(record[0:6])
          preload_2002["PN"].append(record[6:9])
          preload_2002["BIRTH_YEAR"].append(record[36:40])
          # 1. Male
          # 2. Female
          preload_2002["SEX"].append(record[19:20])
  # Tag every row with the survey wave.
  preload_2002["WAVE"] = 2002
  result_2002_one = pd.DataFrame(preload_2002)
  # H02C_R.da: 2002 smoking, drinking, BMI, heart-problem and stroke variables,
  # keyed by HHID + PN, left-merged onto result_2002_one.
  health_2002 = {
      "HHID": [],
      "PN": [],
      "SMOKED": [],
      "DRINKED": [],
      "BMI": [],
      "HEART_PROBL": [],
      "STROKE": [],
  }
  with open("/root/r_base/HRS/2002/data/H02C_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          health_2002["HHID"].append(line[0:6])
          health_2002["PN"].append(line[6:9])

          # Smoking:
          # 1. YES
          # 5. NO
          # Any other value is missing.
          smoked_code = line[155:156]
          health_2002["SMOKED"].append(
              smoked_code if smoked_code in ("1", "5") else np.nan
          )

          # Raw drinking codes:
          # 1. YES
          # 3. [VOL] NEVER HAVE USED ALCOHOL
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          drinked_code = line[179:180]
          if drinked_code == "1":
              drinked = "1"
          elif drinked_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          health_2002["DRINKED"].append(drinked)

          # Weight: pounds -> kilograms. A blank field used to raise ValueError
          # in float(); it is now missing, as are values above 400 (presumably
          # the 998/999 codes -- confirm).
          weight_text = line[190:193].strip()
          if weight_text and float(weight_text) <= 400:
              weight_kg = float(weight_text) * 0.45359237
          else:
              weight_kg = np.nan

          # Height: one-digit feet + two-digit inches -> metres.
          # Fields are stripped first so that a blank column (a space, not an
          # empty string) is recognised. Feet codes 8/9 are missing; blank/98/99
          # inches fall back to feet only.
          feet_text = line[194:195].strip()
          inches_text = line[195:197].strip()
          bmi = np.nan
          if feet_text not in ("", "8", "9") and not pd.isna(weight_kg):
              height_m = float(feet_text) * 0.3048
              if inches_text not in ("", "98", "99"):
                  height_m += float(inches_text) * 0.0254
              if height_m > 0:
                  bmi = weight_kg / (height_m ** 2)
          health_2002["BMI"].append(bmi)

          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure, or other heart problems). Raw codes:
          # 1. YES
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. A blank field is missing; previously
          # it fell through to the catch-all and was coded "5".
          heart_code = line[59:60].strip()
          if heart_code in ("", "8", "9"):
              heart = np.nan
          elif heart_code in ("1", "3"):
              heart = "1"
          else:
              heart = "5"
          health_2002["HEART_PROBL"].append(heart)

          # Stroke. Raw codes:
          # 1. YES
          # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. A blank field is missing; previously
          # it fell through to the catch-all and was coded "1" (Yes).
          stroke_code = line[80:81].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          health_2002["STROKE"].append(stroke)

  result_2002_two = pd.DataFrame(health_2002)
  result_2002 = pd.merge(result_2002_one, result_2002_two, on=["HHID", "PN"], how="left")
  # H02V_R.da: three-level physical activity score for 2002, keyed by HHID + PN.
  activity_2002 = {"HHID": [], "PN": [], "PHYSICAL_ACTIVITY_LEVEL": []}
  # Raw frequency codes shared by the vigorous and moderate items:
  # 1. MORE THAN ONCE A WEEK
  # 2. ONCE A WEEK
  # 3. ONCE TO THREE TIMES A MONTH
  # 4. HARDLY EVER OR NEVER
  # 8. DK (Don't Know)
  # 9. RF (Refused)
  weekly_codes = {"1", "2"}
  rare_codes = {"3", "4"}
  with open("/root/r_base/HRS/2002/data/H02V_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          activity_2002["HHID"].append(record[0:6])
          activity_2002["PN"].append(record[6:9])

          vigorous_raw = record[151:152]  # vigorous activity frequency
          moderate_raw = record[152:153]  # moderate activity frequency

          # Output levels:
          # 3. vigorous (vigorous item coded 1 or 2)
          # 2. moderate (moderate item coded 1 or 2)
          # 1. inactive (either item coded 3 or 4)
          # missing when neither item has a usable code
          if vigorous_raw in weekly_codes:
              level = "3"
          elif moderate_raw in weekly_codes:
              level = "2"
          elif vigorous_raw in rare_codes or moderate_raw in rare_codes:
              level = "1"
          else:
              level = np.nan
          activity_2002["PHYSICAL_ACTIVITY_LEVEL"].append(level)

  result_2002_three = pd.DataFrame(activity_2002)
  result_2002 = pd.merge(
      result_2002,
      result_2002_three,
      on=["HHID", "PN"],
      how="left",
  )
  # H02B_R.da: marital status and years of education for 2002, keyed by HHID + PN.
  # The frame is left-merged into result_2002, which is then appended to `result`.
  demo_2002 = {"HHID": [], "PN": [], "MARITAL_STATUS": [], "EDUCATION": []}
  with open("/root/r_base/HRS/2002/data/H02B_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          demo_2002["HHID"].append(record[0:6])
          demo_2002["PN"].append(record[6:9])

          # Raw marital-status codes:
          # 1. Married
          # 3. Separated
          # 4. Divorced
          # 5. Widowed
          # 6. Never Married
          # 7. Other (Specify)
          # 8. DK (Don't Know)
          # 9. RF (Refused)
          # Recoded to: 1 = married or partner; 5 = other; anything else missing.
          marital_raw = record[131:132]
          if marital_raw == "1":
              marital = "1"
          elif marital_raw in {"3", "4", "5", "6", "7"}:
              marital = "5"
          else:
              marital = np.nan
          demo_2002["MARITAL_STATUS"].append(marital)

          # Education codes:
          # 0 ........No formal education
          # 1-11 .....Grades
          # 12 .......High school
          # 13-15 ....Some college
          # 16 .......College grad
          # 17 .......Post college (17+ years)
          # 97 .......Other
          # 98. DK (don't know); NA (not ascertained)
          # 99. RF (refused)
          schooling = record[42:44]
          if schooling in {"97", "98", "99"}:
              demo_2002["EDUCATION"].append(np.nan)
          else:
              demo_2002["EDUCATION"].append(schooling)

  result_2002_four = pd.DataFrame(demo_2002)
  result_2002 = pd.merge(
      result_2002,
      result_2002_four,
      on=["HHID", "PN"],
      how="left",
  )
  result = pd.concat([result, result_2002], axis=0)
  # Load the 2004 data.
  # H04PR_R.da: birth year and sex, keyed by HHID + PN.
  preload_2004 = {"HHID": [], "PN": [], "BIRTH_YEAR": [], "SEX": []}
  with open("/root/r_base/HRS/2004/data/H04PR_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          preload_2004["HHID"].append(record[0:6])
          preload_2004["PN"].append(record[6:9])
          preload_2004["BIRTH_YEAR"].append(record[25:29])
          # 1. Male
          # 2. Female
          preload_2004["SEX"].append(record[20:21])
  # Tag every row with the survey wave.
  preload_2004["WAVE"] = 2004
  result_2004_one = pd.DataFrame(preload_2004)
  # H04C_R.da: 2004 smoking, drinking, BMI, heart-problem, stroke and physical
  # activity variables, keyed by HHID + PN, left-merged onto result_2004_one.
  health_2004 = {
      "HHID": [],
      "PN": [],
      "SMOKED": [],
      "DRINKED": [],
      "BMI": [],
      "HEART_PROBL": [],
      "STROKE": [],
      "PHYSICAL_ACTIVITY_LEVEL": [],
  }
  with open("/root/r_base/HRS/2004/data/H04C_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          health_2004["HHID"].append(line[0:6])
          health_2004["PN"].append(line[6:9])

          # Smoking:
          # 1. YES
          # 5. NO
          # Any other value is missing.
          smoked_code = line[167:168]
          health_2004["SMOKED"].append(
              smoked_code if smoked_code in ("1", "5") else np.nan
          )

          # Raw drinking codes:
          # 1. YES
          # 3. [VOL] NEVER HAVE USED ALCOHOL
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          drinked_code = line[192:193]
          if drinked_code == "1":
              drinked = "1"
          elif drinked_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          health_2004["DRINKED"].append(drinked)

          # Weight: pounds -> kilograms. A blank field used to raise ValueError
          # in float(); it is now missing, as are values above 400 (presumably
          # the 998/999 codes -- confirm).
          weight_text = line[203:206].strip()
          if weight_text and float(weight_text) <= 400:
              weight_kg = float(weight_text) * 0.45359237
          else:
              weight_kg = np.nan

          # Height: one-digit feet + two-digit inches -> metres.
          # Fields are stripped first so that a blank column (a space, not an
          # empty string) is recognised. Feet codes 8/9 are missing; blank/98/99
          # inches fall back to feet only.
          feet_text = line[222:223].strip()
          inches_text = line[223:225].strip()
          bmi = np.nan
          if feet_text not in ("", "8", "9") and not pd.isna(weight_kg):
              height_m = float(feet_text) * 0.3048
              if inches_text not in ("", "98", "99"):
                  height_m += float(inches_text) * 0.0254
              if height_m > 0:
                  bmi = weight_kg / (height_m ** 2)
          health_2004["BMI"].append(bmi)

          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure, or other heart problems). Raw codes:
          # 1. YES
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. A blank field is missing; previously
          # it fell through to the catch-all and was coded "5".
          heart_code = line[69:70].strip()
          if heart_code in ("", "8", "9"):
              heart = np.nan
          elif heart_code in ("1", "3"):
              heart = "1"
          else:
              heart = "5"
          health_2004["HEART_PROBL"].append(heart)

          # Stroke. Raw codes:
          # 1. YES
          # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. A blank field is missing; previously
          # it fell through to the catch-all and was coded "1" (Yes).
          stroke_code = line[90:91].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          health_2004["STROKE"].append(stroke)

          # Physical activity. Raw frequency codes for both items:
          # 1. MORE THAN ONCE A WEEK
          # 2. ONCE A WEEK
          # 3. ONCE TO THREE TIMES A MONTH
          # 4. HARDLY EVER OR NEVER
          # 7. (VOL) EVERY DAY
          # 8. DK (Don't Know)
          # 9. RF (Refused)
          # Output levels:
          # 3. vigorous (vigorous item coded 1, 2 or 7)
          # 2. moderate (moderate item coded 1, 2 or 7)
          # 1. inactive (either item coded 3 or 4)
          # missing when neither item has a usable code
          vigorous_code = line[164:165]
          moderate_code = line[165:166]
          if vigorous_code in ("1", "2", "7"):
              activity = "3"
          elif moderate_code in ("1", "2", "7"):
              activity = "2"
          elif vigorous_code in ("3", "4") or moderate_code in ("3", "4"):
              activity = "1"
          else:
              activity = np.nan
          health_2004["PHYSICAL_ACTIVITY_LEVEL"].append(activity)

  result_2004_two = pd.DataFrame(health_2004)
  result_2004 = pd.merge(result_2004_one, result_2004_two, on=["HHID", "PN"], how="left")
  # H04B_R.da: marital status and years of education for 2004, keyed by HHID + PN.
  # The frame is left-merged into result_2004, which is then appended to `result`.
  demo_2004 = {"HHID": [], "PN": [], "MARITAL_STATUS": [], "EDUCATION": []}
  with open("/root/r_base/HRS/2004/data/H04B_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          demo_2004["HHID"].append(record[0:6])
          demo_2004["PN"].append(record[6:9])

          # Raw marital-status codes:
          # 1. MARRIED (VOL)
          # 2. ANULLED (VOL)
          # 3. SEPARATED
          # 4. DIVORCED
          # 5. WIDOWED
          # 6. NEVER MARRIED
          # 7. OTHER (SPECIFY)
          # Recoded to: 1 = married or partner; 5 = other; anything else missing.
          marital_raw = record[161:162]
          if marital_raw == "1":
              marital = "1"
          elif marital_raw in {"2", "3", "4", "5", "6", "7"}:
              marital = "5"
          else:
              marital = np.nan
          demo_2004["MARITAL_STATUS"].append(marital)

          # Education codes:
          # 0 ........No formal education
          # 1-11 .....Grades
          # 12 .......High school
          # 13-15 ....Some college
          # 16 .......College grad
          # 17 .......Post college (17+ years)
          # 97 .......Other
          # 98. DK (don't know); NA (not ascertained)
          # 99. RF (refused)
          schooling = record[41:43]
          if schooling in {"97", "98", "99"}:
              demo_2004["EDUCATION"].append(np.nan)
          else:
              demo_2004["EDUCATION"].append(schooling)

  result_2004_four = pd.DataFrame(demo_2004)
  result_2004 = pd.merge(
      result_2004,
      result_2004_four,
      on=["HHID", "PN"],
      how="left",
  )
  result = pd.concat([result, result_2004], axis=0)
  # Load the 2006 data.
  # H06PR_R.da: birth year and sex, keyed by HHID + PN.
  preload_2006 = {"HHID": [], "PN": [], "BIRTH_YEAR": [], "SEX": []}
  with open("/root/r_base/HRS/2006/data/H06PR_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for record in file:
          preload_2006["HHID"].append(record[0:6])
          preload_2006["PN"].append(record[6:9])
          preload_2006["BIRTH_YEAR"].append(record[25:29])
          # 1. Male
          # 2. Female
          preload_2006["SEX"].append(record[20:21])
  # Tag every row with the survey wave.
  preload_2006["WAVE"] = 2006
  result_2006_one = pd.DataFrame(preload_2006)
  # H06C_R.da: 2006 smoking, drinking, BMI, heart-problem, stroke and physical
  # activity variables, keyed by HHID + PN, left-merged onto result_2006_one.
  health_2006 = {
      "HHID": [],
      "PN": [],
      "SMOKED": [],
      "DRINKED": [],
      "BMI": [],
      "HEART_PROBL": [],
      "STROKE": [],
      "PHYSICAL_ACTIVITY_LEVEL": [],
  }
  with open("/root/r_base/HRS/2006/data/H06C_R.da", "r", encoding="utf-8") as file:
      # Read the fixed-width file line by line.
      for line in file:
          health_2006["HHID"].append(line[0:6])
          health_2006["PN"].append(line[6:9])

          # Smoking:
          # 1. YES
          # 5. NO
          # Any other value is missing.
          smoked_code = line[181:182]
          health_2006["SMOKED"].append(
              smoked_code if smoked_code in ("1", "5") else np.nan
          )

          # Raw drinking codes:
          # 1. YES
          # 3. [VOL] NEVER HAVE USED ALCOHOL
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to:
          # 1. Yes
          # 5. No
          drinked_code = line[207:208]
          if drinked_code == "1":
              drinked = "1"
          elif drinked_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          health_2006["DRINKED"].append(drinked)

          # Weight: pounds -> kilograms. Blank values and values above 400 are
          # treated as missing (presumably the 998/999 codes -- confirm).
          weight_text = line[218:221].strip()
          if weight_text and float(weight_text) <= 400:
              weight_kg = float(weight_text) * 0.45359237
          else:
              weight_kg = np.nan

          # Height: one-digit feet + two-digit inches -> metres.
          # Fields are stripped before the blank test: a blank fixed-width
          # column is a space, so the original `== ""` checks never matched and
          # float(" ") raised ValueError. Feet codes 8/9 are missing;
          # blank/98/99 inches fall back to feet only.
          feet_text = line[252:253].strip()
          inches_text = line[253:255].strip()
          bmi = np.nan
          if feet_text not in ("", "8", "9") and not pd.isna(weight_kg):
              height_m = float(feet_text) * 0.3048
              if inches_text not in ("", "98", "99"):
                  height_m += float(inches_text) * 0.0254
              if height_m > 0:
                  bmi = weight_kg / (height_m ** 2)
          health_2006["BMI"].append(bmi)

          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure, or other heart problems). Raw codes:
          # 1. YES
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. The code is stripped so that a blank
          # column is missing instead of falling through to "5".
          heart_code = line[80:81].strip()
          if heart_code in ("", "8", "9"):
              heart = np.nan
          elif heart_code in ("1", "3"):
              heart = "1"
          else:
              heart = "5"
          health_2006["HEART_PROBL"].append(heart)

          # Stroke. Raw codes:
          # 1. YES
          # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
          # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
          # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
          # 5. NO
          # 8. DK (don't know); NA (not ascertained)
          # 9. RF (refused)
          # Recoded to 1 = Yes, 5 = No. The code is stripped so that a blank
          # column is missing instead of falling through to "1" (Yes).
          stroke_code = line[101:102].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          health_2006["STROKE"].append(stroke)

          # Physical activity. Raw frequency codes for both items:
          # 1. MORE THAN ONCE A WEEK
          # 2. ONCE A WEEK
          # 3. ONCE TO THREE TIMES A MONTH
          # 4. HARDLY EVER OR NEVER
          # 7. (VOL) EVERY DAY
          # 8. DK (Don't Know)
          # 9. RF (Refused)
          # Output levels:
          # 3. vigorous (vigorous item coded 1, 2 or 7)
          # 2. moderate (moderate item coded 1, 2 or 7)
          # 1. inactive (either item coded 3 or 4)
          # missing when neither item has a usable code
          vigorous_code = line[178:179]
          moderate_code = line[179:180]
          if vigorous_code in ("1", "2", "7"):
              activity = "3"
          elif moderate_code in ("1", "2", "7"):
              activity = "2"
          elif vigorous_code in ("3", "4") or moderate_code in ("3", "4"):
              activity = "1"
          else:
              activity = np.nan
          health_2006["PHYSICAL_ACTIVITY_LEVEL"].append(activity)

  result_2006_two = pd.DataFrame(health_2006)
  result_2006 = pd.merge(result_2006_one, result_2006_two, on=["HHID", "PN"], how="left")
  1475. with(open("/root/r_base/HRS/2006/data/H06B_R.da", "r", encoding="utf-8") )as file:
  1476. HHID_list = []
  1477. PN_list = []
  1478. MARITAL_STATUS_list = []
  1479. EDUCATION_list = []
  1480. # 逐行读取文件
  1481. for line in file:
  1482. HHID = line[0:6]
  1483. PN = line[6:9]
  1484. # 1. MARRIED (VOL)
  1485. # 2. ANULLED (VOL)
  1486. # 3. SEPARATED
  1487. # 4. DIVORCED
  1488. # 5. WIDOWED
  1489. # 6. NEVER MARRIED
  1490. # 7. OTHER (SPECIFY)
  1491. # 8. DK (Don't Know); NA (Not Ascertained)
  1492. # 9. RF (Refused)
  1493. MARITAL_STATUS = line[177:178]
  1494. # 1 Married or Partner; 5 other
  1495. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
  1496. # 0 For no formal education
  1497. # 1-11 .....Grades
  1498. # 12 .......High school
  1499. # 13-15 ....Some college
  1500. # 16 .......College grad
  1501. # 17 .......Post college (17+ years)
  1502. # 97 .......Other
  1503. # 98. DK (don't know); NA (not ascertained)
  1504. # 99. RF (refused)
  1505. EDUCATION = line[47:49]
  1506. EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" else EDUCATION
  1507. HHID_list.append(HHID)
  1508. PN_list.append(PN)
  1509. MARITAL_STATUS_list.append(MARITAL_STATUS)
  1510. EDUCATION_list.append(EDUCATION)
  1511. data = {
  1512. "HHID":HHID_list,
  1513. "PN":PN_list,
  1514. "MARITAL_STATUS":MARITAL_STATUS_list,
  1515. "EDUCATION":EDUCATION_list
  1516. }
  1517. result_2006_four = pd.DataFrame(data)
  1518. result_2006 = pd.merge(result_2006, result_2006_four, on=["HHID", "PN"], how="left")
  1519. result = pd.concat([result, result_2006], axis=0)
  1520. # 获取2008数据
  1521. with(open("/root/r_base/HRS/2008/data/H08PR_R.da", "r", encoding="utf-8") )as file:
  1522. HHID_list = []
  1523. PN_list = []
  1524. BIRTH_YEAR_list = []
  1525. SEX_list = []
  1526. # 逐行读取文件
  1527. for line in file:
  1528. HHID = line[0:6]
  1529. PN = line[6:9]
  1530. BIRTH_YEAR = line[25:29]
  1531. # 1.Male
  1532. # 2.Female
  1533. SEX = line[20:21]
  1534. HHID_list.append(HHID)
  1535. PN_list.append(PN)
  1536. BIRTH_YEAR_list.append(BIRTH_YEAR)
  1537. SEX_list.append(SEX)
  1538. data = {
  1539. "HHID":HHID_list,
  1540. "PN":PN_list,
  1541. "BIRTH_YEAR":BIRTH_YEAR_list,
  1542. "SEX":SEX_list,
  1543. }
  1544. data["WAVE"] = 2008
  1545. result_2008_one = pd.DataFrame(data)
  1546. with(open("/root/r_base/HRS/2008/data/H08C_R.da", "r", encoding="utf-8") )as file:
  1547. HHID_list = []
  1548. PN_list = []
  1549. SMOKED_list = []
  1550. DRINKED_list = []
  1551. BMI_list = []
  1552. HEART_PROBL_list = []
  1553. PHYSICAL_ACTIVITY_LEVEL_list = []
  1554. STROKE_list=[]
  1555. # 逐行读取文件
  1556. for line in file:
  1557. HHID = line[0:6]
  1558. PN = line[6:9]
  1559. # 1. YES
  1560. # 5. NO
  1561. SMOKED = line[258:259]
  1562. SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
  1563. # 1. YES
  1564. # 3. [VOL] NEVER HAVE USED ALCOHOL
  1565. # 5. NO
  1566. # 8. DK (don't know); NA (not ascertained)
  1567. # 9. RF (refused)
  1568. # 合并后
  1569. # 1. Yes
  1570. # 5. No [Inap in V502-V505]
  1571. DRINKED = line[284:285]
  1572. DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
  1573. # 3. vigorous (vigorous activity more than once a week)
  1574. # 2. moderate (moderate activity more than once a week)
  1575. # 1. inactive (the rest)
  1576. # 体重
  1577. WEIGH = line[295:298]
  1578. WEIGH= float(WEIGH)*0.45359237 if not float(WEIGH)>400 else np.nan
  1579. # 身高
  1580. HEIGHT_FEET = line[329:330].strip()
  1581. HEIGHT_INCHES = line[330:338].strip()
  1582. if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
  1583. if not HEIGHT_INCHES == "99998" and not HEIGHT_INCHES=="":
  1584. HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
  1585. # BMI
  1586. BMI = WEIGH / math.pow(HEIGHT,2)
  1587. else:
  1588. HEIGHT = float(HEIGHT_FEET)*0.3048
  1589. # BMI
  1590. BMI = WEIGH / math.pow(HEIGHT,2)
  1591. else :
  1592. BMI = np.nan
  1593. # 1. YES
  1594. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  1595. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  1596. # 5. NO
  1597. # 8. DK (don't know); NA (not ascertained)
  1598. # 9. RF (refused)
  1599. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  1600. # 1. Yes
  1601. # 5. No [Inap in V418-V421]
  1602. HEART_PROBL = line[128:129]
  1603. HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
  1604. # 1. YES
  1605. # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
  1606. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  1607. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  1608. # 5. NO
  1609. # 8. DK (don't know); NA (not ascertained)
  1610. # 9. RF (refused)
  1611. # STROKE
  1612. # 1. Yes
  1613. # 5. No [Inap in V418-V421]
  1614. STROKE = line[149:150]
  1615. STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
  1616. # 1. MORE THAN ONCE A WEEK
  1617. # 2. ONCE A WEEK
  1618. # 3. ONCE TO THREE TIMES A MONTH
  1619. # 4. HARDLY EVER OR NEVER
  1620. # 7. (VOL) EVERY DAY
  1621. # 8. DK (Don't Know)
  1622. # 9. RF (Refused)
  1623. # 3. vigorous (vigorous activity more than once a week)
  1624. # 2. moderate (moderate activity more than once a week)
  1625. # 1. inactive (the rest)
  1626. # 重度活动
  1627. VIGOROUS_PHYSICAL = line[255:256]
  1628. VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
  1629. #中度活动
  1630. MODERATE_PHYSICAL = line[256:257]
  1631. MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
  1632. # 轻度活动
  1633. LIGHT_PHYSICAL = np.nan
  1634. PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
  1635. HHID_list.append(HHID)
  1636. PN_list.append(PN)
  1637. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  1638. SMOKED_list.append(SMOKED)
  1639. DRINKED_list.append(DRINKED)
  1640. BMI_list.append(BMI)
  1641. HEART_PROBL_list.append(HEART_PROBL)
  1642. STROKE_list.append(STROKE)
  1643. data = {
  1644. "HHID":HHID_list,
  1645. "PN":PN_list,
  1646. "SMOKED":SMOKED_list,
  1647. "DRINKED":DRINKED_list,
  1648. "BMI":BMI_list,
  1649. "HEART_PROBL":HEART_PROBL_list,
  1650. "STROKE":STROKE_list,
  1651. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
  1652. }
  1653. result_2008_two = pd.DataFrame(data)
  1654. result_2008 = pd.merge(result_2008_one, result_2008_two, on=["HHID", "PN"], how="left")
  1655. with(open("/root/r_base/HRS/2008/data/H08B_R.da", "r", encoding="utf-8") )as file:
  1656. HHID_list = []
  1657. PN_list = []
  1658. MARITAL_STATUS_list = []
  1659. EDUCATION_list = []
  1660. # 逐行读取文件
  1661. for line in file:
  1662. HHID = line[0:6]
  1663. PN = line[6:9]
  1664. # 1. MARRIED (VOL)
  1665. # 2. ANULLED (VOL)
  1666. # 3. SEPARATED
  1667. # 4. DIVORCED
  1668. # 5. WIDOWED
  1669. # 6. NEVER MARRIED
  1670. # 7. OTHER (SPECIFY)
  1671. # 8. DK (Don't Know); NA (Not Ascertained)
  1672. # 9. RF (Refused)
  1673. MARITAL_STATUS = line[311:312]
  1674. # 1 Married or Partner; 5 other
  1675. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
  1676. # 0 For no formal education
  1677. # 1-11 .....Grades
  1678. # 12 .......High school
  1679. # 13-15 ....Some college
  1680. # 16 .......College grad
  1681. # 17 .......Post college (17+ years)
  1682. # 97 .......Other
  1683. # 98. DK (don't know); NA (not ascertained)
  1684. # 99. RF (refused)
  1685. EDUCATION = line[48:50]
  1686. EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
  1687. HHID_list.append(HHID)
  1688. PN_list.append(PN)
  1689. MARITAL_STATUS_list.append(MARITAL_STATUS)
  1690. EDUCATION_list.append(EDUCATION)
  1691. data = {
  1692. "HHID":HHID_list,
  1693. "PN":PN_list,
  1694. "MARITAL_STATUS":MARITAL_STATUS_list,
  1695. "EDUCATION":EDUCATION_list
  1696. }
  1697. result_2008_four = pd.DataFrame(data)
  1698. result_2008 = pd.merge(result_2008, result_2008_four, on=["HHID", "PN"], how="left")
  1699. result = pd.concat([result, result_2008], axis=0)
  1700. # 获取2010数据
  1701. with(open("/root/r_base/HRS/2010/data/H10PR_R.da", "r", encoding="utf-8") )as file:
  1702. HHID_list = []
  1703. PN_list = []
  1704. BIRTH_YEAR_list = []
  1705. SEX_list = []
  1706. # 逐行读取文件
  1707. for line in file:
  1708. HHID = line[0:6]
  1709. PN = line[6:9]
  1710. BIRTH_YEAR = line[25:29]
  1711. # 1.Male
  1712. # 2.Female
  1713. SEX = line[20:21]
  1714. HHID_list.append(HHID)
  1715. PN_list.append(PN)
  1716. BIRTH_YEAR_list.append(BIRTH_YEAR)
  1717. SEX_list.append(SEX)
  1718. data = {
  1719. "HHID":HHID_list,
  1720. "PN":PN_list,
  1721. "BIRTH_YEAR":BIRTH_YEAR_list,
  1722. "SEX":SEX_list,
  1723. }
  1724. data["WAVE"] = 2010
  1725. result_2010_one = pd.DataFrame(data)
  1726. with(open("/root/r_base/HRS/2010/data/H10C_R.da", "r", encoding="utf-8") )as file:
  1727. HHID_list = []
  1728. PN_list = []
  1729. SMOKED_list = []
  1730. DRINKED_list = []
  1731. BMI_list = []
  1732. HEART_PROBL_list = []
  1733. PHYSICAL_ACTIVITY_LEVEL_list = []
  1734. STROKE_list=[]
  1735. # 逐行读取文件
  1736. for line in file:
  1737. HHID = line[0:6]
  1738. PN = line[6:9]
  1739. # 1. YES
  1740. # 5. NO
  1741. SMOKED = line[250:251]
  1742. SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
  1743. # 1. YES
  1744. # 3. [VOL] NEVER HAVE USED ALCOHOL
  1745. # 5. NO
  1746. # 8. DK (don't know); NA (not ascertained)
  1747. # 9. RF (refused)
  1748. # 合并后
  1749. # 1. Yes
  1750. # 5. No [Inap in V502-V505]
  1751. DRINKED = line[276:277]
  1752. DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
  1753. # 3. vigorous (vigorous activity more than once a week)
  1754. # 2. moderate (moderate activity more than once a week)
  1755. # 1. inactive (the rest)
  1756. # 体重
  1757. WEIGH = line[287:290].strip()
  1758. WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
  1759. # 身高
  1760. HEIGHT_FEET = line[304:305].strip()
  1761. HEIGHT_INCHES = line[305:313]
  1762. if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
  1763. if not HEIGHT_INCHES == "99998" and not HEIGHT_INCHES=="":
  1764. HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
  1765. # BMI
  1766. BMI = WEIGH / math.pow(HEIGHT,2)
  1767. else:
  1768. HEIGHT = float(HEIGHT_FEET)*0.3048
  1769. # BMI
  1770. BMI = WEIGH / math.pow(HEIGHT,2)
  1771. else :
  1772. BMI = np.nan
  1773. # 1. YES
  1774. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  1775. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  1776. # 5. NO
  1777. # 8. DK (don't know); NA (not ascertained)
  1778. # 9. RF (refused)
  1779. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  1780. # 1. Yes
  1781. # 5. No [Inap in V418-V421]
  1782. HEART_PROBL = line[76:77]
  1783. HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
  1784. # 1. YES
  1785. # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
  1786. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  1787. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  1788. # 5. NO
  1789. # 8. DK (don't know); NA (not ascertained)
  1790. # 9. RF (refused)
  1791. # STROKE
  1792. # 1. Yes
  1793. # 5. No [Inap in V418-V421]
  1794. STROKE = line[139:140]
  1795. STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
  1796. # 1. MORE THAN ONCE A WEEK
  1797. # 2. ONCE A WEEK
  1798. # 3. ONCE TO THREE TIMES A MONTH
  1799. # 4. HARDLY EVER OR NEVER
  1800. # 7. (VOL) EVERY DAY
  1801. # 8. DK (Don't Know)
  1802. # 9. RF (Refused)
  1803. # 3. vigorous (vigorous activity more than once a week)
  1804. # 2. moderate (moderate activity more than once a week)
  1805. # 1. inactive (the rest)
  1806. # 重度活动
  1807. VIGOROUS_PHYSICAL = line[247:248]
  1808. VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
  1809. #中度活动
  1810. MODERATE_PHYSICAL = line[248:249]
  1811. MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
  1812. # 轻度活动
  1813. LIGHT_PHYSICAL = np.nan
  1814. PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
  1815. HHID_list.append(HHID)
  1816. PN_list.append(PN)
  1817. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  1818. SMOKED_list.append(SMOKED)
  1819. DRINKED_list.append(DRINKED)
  1820. BMI_list.append(BMI)
  1821. HEART_PROBL_list.append(HEART_PROBL)
  1822. STROKE_list.append(STROKE)
  1823. data = {
  1824. "HHID":HHID_list,
  1825. "PN":PN_list,
  1826. "SMOKED":SMOKED_list,
  1827. "DRINKED":DRINKED_list,
  1828. "BMI":BMI_list,
  1829. "HEART_PROBL":HEART_PROBL_list,
  1830. "STROKE":STROKE_list,
  1831. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
  1832. }
  1833. result_2010_two = pd.DataFrame(data)
  1834. result_2010 = pd.merge(result_2010_one, result_2010_two, on=["HHID", "PN"], how="left")
  1835. with(open("/root/r_base/HRS/2010/data/H10B_R.da", "r", encoding="utf-8") )as file:
  1836. HHID_list = []
  1837. PN_list = []
  1838. MARITAL_STATUS_list = []
  1839. EDUCATION_list = []
  1840. # 逐行读取文件
  1841. for line in file:
  1842. HHID = line[0:6]
  1843. PN = line[6:9]
  1844. # 1. MARRIED (VOL)
  1845. # 2. ANULLED (VOL)
  1846. # 3. SEPARATED
  1847. # 4. DIVORCED
  1848. # 5. WIDOWED
  1849. # 6. NEVER MARRIED
  1850. # 7. OTHER (SPECIFY)
  1851. # 8. DK (Don't Know); NA (Not Ascertained)
  1852. # 9. RF (Refused)
  1853. MARITAL_STATUS = line[305:306]
  1854. # 1 Married or Partner; 5 other
  1855. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
  1856. # 0 For no formal education
  1857. # 1-11 .....Grades
  1858. # 12 .......High school
  1859. # 13-15 ....Some college
  1860. # 16 .......College grad
  1861. # 17 .......Post college (17+ years)
  1862. # 97 .......Other
  1863. # 98. DK (don't know); NA (not ascertained)
  1864. # 99. RF (refused)
  1865. EDUCATION = line[48:50]
  1866. EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
  1867. HHID_list.append(HHID)
  1868. PN_list.append(PN)
  1869. MARITAL_STATUS_list.append(MARITAL_STATUS)
  1870. EDUCATION_list.append(EDUCATION)
  1871. data = {
  1872. "HHID":HHID_list,
  1873. "PN":PN_list,
  1874. "MARITAL_STATUS":MARITAL_STATUS_list,
  1875. "EDUCATION":EDUCATION_list
  1876. }
  1877. result_2010_four = pd.DataFrame(data)
  1878. result_2010 = pd.merge(result_2010, result_2010_four, on=["HHID", "PN"], how="left")
  1879. result = pd.concat([result, result_2010], axis=0)
  1880. # 获取2012数据
  1881. with(open("/root/r_base/HRS/2012/data/H12PR_R.da", "r", encoding="utf-8") )as file:
  1882. HHID_list = []
  1883. PN_list = []
  1884. BIRTH_YEAR_list = []
  1885. SEX_list = []
  1886. # 逐行读取文件
  1887. for line in file:
  1888. HHID = line[0:6]
  1889. PN = line[6:9]
  1890. BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan
  1891. # 1.Male
  1892. # 2.Female
  1893. SEX = line[20:21] if not line[20:21] == "" else np.nan
  1894. HHID_list.append(HHID)
  1895. PN_list.append(PN)
  1896. BIRTH_YEAR_list.append(BIRTH_YEAR)
  1897. SEX_list.append(SEX)
  1898. data = {
  1899. "HHID":HHID_list,
  1900. "PN":PN_list,
  1901. "BIRTH_YEAR":BIRTH_YEAR_list,
  1902. "SEX":SEX_list,
  1903. }
  1904. data["WAVE"] = 2012
  1905. result_2012_one = pd.DataFrame(data)
  1906. with(open("/root/r_base/HRS/2012/data/H12C_R.da", "r", encoding="utf-8") )as file:
  1907. HHID_list = []
  1908. PN_list = []
  1909. SMOKED_list = []
  1910. DRINKED_list = []
  1911. BMI_list = []
  1912. HEART_PROBL_list = []
  1913. PHYSICAL_ACTIVITY_LEVEL_list = []
  1914. STROKE_list=[]
  1915. # 逐行读取文件
  1916. for line in file:
  1917. HHID = line[0:6]
  1918. PN = line[6:9]
  1919. # 1. YES
  1920. # 5. NO
  1921. SMOKED = line[249:250]
  1922. SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
  1923. # 1. YES
  1924. # 3. [VOL] NEVER HAVE USED ALCOHOL
  1925. # 5. NO
  1926. # 8. DK (don't know); NA (not ascertained)
  1927. # 9. RF (refused)
  1928. # 合并后
  1929. # 1. Yes
  1930. # 5. No [Inap in V502-V505]
  1931. DRINKED = line[276:277]
  1932. DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
  1933. # 3. vigorous (vigorous activity more than once a week)
  1934. # 2. moderate (moderate activity more than once a week)
  1935. # 1. inactive (the rest)
  1936. # 体重
  1937. WEIGH = line[287:290].strip()
  1938. WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
  1939. # 身高
  1940. HEIGHT_FEET = line[302:303].strip()
  1941. HEIGHT_INCHES = line[303:308]
  1942. if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
  1943. if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
  1944. HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
  1945. # BMI
  1946. BMI = WEIGH / math.pow(HEIGHT,2)
  1947. else:
  1948. HEIGHT = float(HEIGHT_FEET)*0.3048
  1949. # BMI
  1950. BMI = WEIGH / math.pow(HEIGHT,2)
  1951. else :
  1952. BMI = np.nan
  1953. # 1. YES
  1954. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  1955. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  1956. # 5. NO
  1957. # 8. DK (don't know); NA (not ascertained)
  1958. # 9. RF (refused)
  1959. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  1960. # 1. Yes
  1961. # 5. No [Inap in V418-V421]
  1962. HEART_PROBL = line[82:83]
  1963. HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
  1964. # 1. YES
  1965. # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
  1966. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  1967. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  1968. # 5. NO
  1969. # 8. DK (don't know); NA (not ascertained)
  1970. # 9. RF (refused)
  1971. # STROKE
  1972. # 1. Yes
  1973. # 5. No [Inap in V418-V421]
  1974. STROKE = line[146:147]
  1975. STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
  1976. # 1. MORE THAN ONCE A WEEK
  1977. # 2. ONCE A WEEK
  1978. # 3. ONCE TO THREE TIMES A MONTH
  1979. # 4. HARDLY EVER OR NEVER
  1980. # 7. (VOL) EVERY DAY
  1981. # 8. DK (Don't Know)
  1982. # 9. RF (Refused)
  1983. # 3. vigorous (vigorous activity more than once a week)
  1984. # 2. moderate (moderate activity more than once a week)
  1985. # 1. inactive (the rest)
  1986. # 重度活动
  1987. VIGOROUS_PHYSICAL = line[246:247]
  1988. VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
  1989. #中度活动
  1990. MODERATE_PHYSICAL = line[247:248]
  1991. MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
  1992. # 轻度活动
  1993. LIGHT_PHYSICAL = np.nan
  1994. PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
  1995. HHID_list.append(HHID)
  1996. PN_list.append(PN)
  1997. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  1998. SMOKED_list.append(SMOKED)
  1999. DRINKED_list.append(DRINKED)
  2000. BMI_list.append(BMI)
  2001. HEART_PROBL_list.append(HEART_PROBL)
  2002. STROKE_list.append(STROKE)
  2003. data = {
  2004. "HHID":HHID_list,
  2005. "PN":PN_list,
  2006. "SMOKED":SMOKED_list,
  2007. "DRINKED":DRINKED_list,
  2008. "BMI":BMI_list,
  2009. "HEART_PROBL":HEART_PROBL_list,
  2010. "STROKE":STROKE_list,
  2011. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
  2012. }
  2013. result_2012_two = pd.DataFrame(data)
  2014. result_2012 = pd.merge(result_2012_one, result_2012_two, on=["HHID", "PN"], how="left")
  2015. with(open("/root/r_base/HRS/2012/data/H12B_R.da", "r", encoding="utf-8") )as file:
  2016. HHID_list = []
  2017. PN_list = []
  2018. MARITAL_STATUS_list = []
  2019. EDUCATION_list = []
  2020. # 逐行读取文件
  2021. for line in file:
  2022. HHID = line[0:6]
  2023. PN = line[6:9]
  2024. # 1. MARRIED (VOL)
  2025. # 2. ANULLED (VOL)
  2026. # 3. SEPARATED
  2027. # 4. DIVORCED
  2028. # 5. WIDOWED
  2029. # 6. NEVER MARRIED
  2030. # 7. OTHER (SPECIFY)
  2031. # 8. DK (Don't Know); NA (Not Ascertained)
  2032. # 9. RF (Refused)
  2033. MARITAL_STATUS = line[294:295]
  2034. # 1 Married or Partner; 5 other
  2035. MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
  2036. # 0 For no formal education
  2037. # 1-11 .....Grades
  2038. # 12 .......High school
  2039. # 13-15 ....Some college
  2040. # 16 .......College grad
  2041. # 17 .......Post college (17+ years)
  2042. # 97 .......Other
  2043. # 98. DK (don't know); NA (not ascertained)
  2044. # 99. RF (refused)
  2045. EDUCATION = line[48:50]
  2046. EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
  2047. HHID_list.append(HHID)
  2048. PN_list.append(PN)
  2049. MARITAL_STATUS_list.append(MARITAL_STATUS)
  2050. EDUCATION_list.append(EDUCATION)
  2051. data = {
  2052. "HHID":HHID_list,
  2053. "PN":PN_list,
  2054. "MARITAL_STATUS":MARITAL_STATUS_list,
  2055. "EDUCATION":EDUCATION_list
  2056. }
  2057. result_2012_four = pd.DataFrame(data)
  2058. result_2012 = pd.merge(result_2012, result_2012_four, on=["HHID", "PN"], how="left")
  2059. result = pd.concat([result, result_2012], axis=0)
  2060. # 获取2014数据
  2061. with(open("/root/r_base/HRS/2014/data/H14PR_R.da", "r", encoding="utf-8") )as file:
  2062. HHID_list = []
  2063. PN_list = []
  2064. BIRTH_YEAR_list = []
  2065. SEX_list = []
  2066. # 逐行读取文件
  2067. for line in file:
  2068. HHID = line[0:6]
  2069. PN = line[6:9]
  2070. BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan
  2071. # 1.Male
  2072. # 2.Female
  2073. SEX = line[20:21] if not line[20:21] == "" else np.nan
  2074. HHID_list.append(HHID)
  2075. PN_list.append(PN)
  2076. BIRTH_YEAR_list.append(BIRTH_YEAR)
  2077. SEX_list.append(SEX)
  2078. data = {
  2079. "HHID":HHID_list,
  2080. "PN":PN_list,
  2081. "BIRTH_YEAR":BIRTH_YEAR_list,
  2082. "SEX":SEX_list,
  2083. }
  2084. data["WAVE"] = 2014
  2085. result_2014_one = pd.DataFrame(data)
  2086. with(open("/root/r_base/HRS/2014/data/H14C_R.da", "r", encoding="utf-8") )as file:
  2087. HHID_list = []
  2088. PN_list = []
  2089. SMOKED_list = []
  2090. DRINKED_list = []
  2091. BMI_list = []
  2092. HEART_PROBL_list = []
  2093. PHYSICAL_ACTIVITY_LEVEL_list = []
  2094. STROKE_list=[]
  2095. # 逐行读取文件
  2096. for line in file:
  2097. HHID = line[0:6]
  2098. PN = line[6:9]
  2099. # 1. YES
  2100. # 5. NO
  2101. SMOKED = line[214:215]
  2102. SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
  2103. # 1. YES
  2104. # 3. [VOL] NEVER HAVE USED ALCOHOL
  2105. # 5. NO
  2106. # 8. DK (don't know); NA (not ascertained)
  2107. # 9. RF (refused)
  2108. # 合并后
  2109. # 1. Yes
  2110. # 5. No [Inap in V502-V505]
  2111. DRINKED = line[239:240]
  2112. DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
  2113. # 3. vigorous (vigorous activity more than once a week)
  2114. # 2. moderate (moderate activity more than once a week)
  2115. # 1. inactive (the rest)
  2116. # 体重
  2117. WEIGH = line[250:253].strip()
  2118. WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
  2119. # 身高
  2120. HEIGHT_FEET = line[259:260].strip()
  2121. HEIGHT_INCHES = line[260:265]
  2122. if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
  2123. if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
  2124. HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
  2125. # BMI
  2126. BMI = WEIGH / math.pow(HEIGHT,2)
  2127. else:
  2128. HEIGHT = float(HEIGHT_FEET)*0.3048
  2129. # BMI
  2130. BMI = WEIGH / math.pow(HEIGHT,2)
  2131. else :
  2132. BMI = np.nan
  2133. # 1. YES
  2134. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  2135. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  2136. # 5. NO
  2137. # 8. DK (don't know); NA (not ascertained)
  2138. # 9. RF (refused)
  2139. # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
  2140. # 1. Yes
  2141. # 5. No [Inap in V418-V421]
  2142. HEART_PROBL = line[66:67]
  2143. HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
  2144. # 1. YES
  2145. # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
  2146. # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
  2147. # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
  2148. # 5. NO
  2149. # 8. DK (don't know); NA (not ascertained)
  2150. # 9. RF (refused)
  2151. # STROKE
  2152. # 1. Yes
  2153. # 5. No [Inap in V418-V421]
  2154. STROKE = line[122:123]
  2155. STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
  2156. # 1. MORE THAN ONCE A WEEK
  2157. # 2. ONCE A WEEK
  2158. # 3. ONCE TO THREE TIMES A MONTH
  2159. # 4. HARDLY EVER OR NEVER
  2160. # 7. (VOL) EVERY DAY
  2161. # 8. DK (Don't Know)
  2162. # 9. RF (Refused)
  2163. # 3. vigorous (vigorous activity more than once a week)
  2164. # 2. moderate (moderate activity more than once a week)
  2165. # 1. inactive (the rest)
  2166. # 重度活动
  2167. VIGOROUS_PHYSICAL = line[211:212]
  2168. VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
  2169. #中度活动
  2170. MODERATE_PHYSICAL = line[212:213]
  2171. MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
  2172. # 轻度活动
  2173. LIGHT_PHYSICAL = np.nan
  2174. PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
  2175. HHID_list.append(HHID)
  2176. PN_list.append(PN)
  2177. PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
  2178. SMOKED_list.append(SMOKED)
  2179. DRINKED_list.append(DRINKED)
  2180. BMI_list.append(BMI)
  2181. HEART_PROBL_list.append(HEART_PROBL)
  2182. STROKE_list.append(STROKE)
  2183. data = {
  2184. "HHID":HHID_list,
  2185. "PN":PN_list,
  2186. "SMOKED":SMOKED_list,
  2187. "DRINKED":DRINKED_list,
  2188. "BMI":BMI_list,
  2189. "HEART_PROBL":HEART_PROBL_list,
  2190. "STROKE":STROKE_list,
  2191. "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
  2192. }
  2193. result_2014_two = pd.DataFrame(data)
  2194. result_2014 = pd.merge(result_2014_one, result_2014_two, on=["HHID", "PN"], how="left")
  # 2014 wave: marital status and education read from H14B_R.da.
  # Every record is sliced by character position; HHID (chars 0-5) and
  # PN (chars 6-8) together identify one respondent and are the merge keys.
  HHID_list = []
  PN_list = []
  MARITAL_STATUS_list = []
  EDUCATION_list = []
  with open("/root/r_base/HRS/2014/data/H14B_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Raw marital codes: 1 married, 2 annulled, 3 separated, 4 divorced,
          # 5 widowed, 6 never married, 7 other, 8 DK/NA, 9 refused.
          # Collapsed to "1" (married or partner) / "5" (other); any other
          # value, including a blank, becomes NaN.
          marital_code = line[274:275]
          if marital_code == "1":
              marital_status = "1"
          elif marital_code in ("2", "3", "4", "5", "6", "7"):
              marital_status = "5"
          else:
              marital_status = np.nan
          # Education: 0 none, 1-11 grades, 12 high school, 13-15 some college,
          # 16 college grad, 17 post college; 97 other, 98 DK/NA, 99 refused.
          education = line[39:41]
          # Test the stripped value: a raw two-character slice such as "  "
          # never equals "", so blank fields were previously kept as data.
          # Valid values are stored exactly as read.
          if education.strip() in ("", "97", "98", "99"):
              education = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          MARITAL_STATUS_list.append(marital_status)
          EDUCATION_list.append(education)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "MARITAL_STATUS": MARITAL_STATUS_list,
      "EDUCATION": EDUCATION_list,
  }
  result_2014_four = pd.DataFrame(data)
  # Left-join onto the 2014 frame built so far, then stack the finished wave
  # under the waves already accumulated in `result`.
  result_2014 = pd.merge(result_2014, result_2014_four, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_2014], axis=0)
  # ---- 2016 wave ----
  # Demographics read from H16PR_R.da: birth year and sex, one row per record,
  # keyed by HHID (chars 0-5) and PN (chars 6-8).
  HHID_list = []
  PN_list = []
  BIRTH_YEAR_list = []
  SEX_list = []
  with open("/root/r_base/HRS/2016/data/H16PR_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          birth_year = line[25:29]
          # Sex: 1 male, 2 female.
          sex = line[20:21]
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          # A blank field in a positional record is whitespace, not "", so
          # missing values are detected on the stripped text; non-blank values
          # are stored exactly as read.
          BIRTH_YEAR_list.append(birth_year if birth_year.strip() else np.nan)
          SEX_list.append(sex if sex.strip() else np.nan)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "BIRTH_YEAR": BIRTH_YEAR_list,
      "SEX": SEX_list,
  }
  # Scalar entry: broadcast to every row when the frame is built.
  data["WAVE"] = 2016
  result_2016_one = pd.DataFrame(data)
  # 2016 wave: health behaviours and conditions read from H16C_R.da.
  # Fields are sliced by character position; HHID + PN are the merge keys.
  HHID_list = []
  PN_list = []
  SMOKED_list = []
  DRINKED_list = []
  BMI_list = []
  HEART_PROBL_list = []
  PHYSICAL_ACTIVITY_LEVEL_list = []
  STROKE_list = []
  with open("/root/r_base/HRS/2016/data/H16C_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Smoking: 1 yes, 5 no; anything else is missing.
          smoked = line[237:238]
          if smoked not in ("1", "5"):
              smoked = np.nan
          # Alcohol: 1 yes, 3 never used alcohol, 5 no, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no (code 3 counts as no).
          drink_code = line[262:263]
          if drink_code == "1":
              drinked = "1"
          elif drink_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          # Weight: pounds converted to kilograms; values above 400 are
          # treated as missing.
          weight_raw = line[273:276].strip()
          weight_kg = np.nan
          if weight_raw:
              pounds = float(weight_raw)
              if not pounds > 400:
                  weight_kg = pounds * 0.45359237
          # Height: feet plus inches converted to metres. The inches field is
          # stripped and compared numerically so that a blank or a 98/99
          # sentinel is actually recognised (the raw five-character slice could
          # never equal "98" or "", and a blank crashed float()).
          feet_raw = line[282:283].strip()
          inches_raw = line[283:288].strip()
          bmi = np.nan
          if feet_raw not in ("", "8", "9") and not pd.isna(weight_kg):
              height_m = float(feet_raw) * 0.3048
              if inches_raw:
                  inches = float(inches_raw)
                  if inches >= 0 and inches not in (98.0, 99.0):
                      height_m += inches * 0.0254
              # Guard against a zero height, which would divide by zero.
              if height_m > 0:
                  bmi = weight_kg / math.pow(height_m, 2)
          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure or other): 1 yes, 3 disputes previous
          # record but now has condition, 4 disputes and does not have it,
          # 5 no, 8 DK/NA, 9 refused. Collapsed to "1" yes / "5" no.
          # Stripped so a blank is missing instead of falling through to "5".
          heart_code = line[67:68].strip()
          if heart_code in ("", "8", "9"):
              heart_probl = np.nan
          elif heart_code in ("1", "3"):
              heart_probl = "1"
          else:
              heart_probl = "5"
          # Stroke: 1 yes, 2 possible stroke or TIA, 3 disputes but now has
          # condition, 4 disputes and does not have it, 5 no, 8 DK/NA,
          # 9 refused. Collapsed to "1" yes / "5" no (2, 4, 5 count as no).
          # Stripped so a blank is missing instead of falling through to "1".
          stroke_code = line[123:124].strip()
          if stroke_code in ("", "8", "9"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          # Activity frequency codes: 1 more than once a week, 2 once a week,
          # 3 one to three times a month, 4 hardly ever or never, 7 every day,
          # 8 DK, 9 refused. Codes 1/2/7 count as active, 3/4 as inactive.
          vigorous_code = line[234:235]
          if vigorous_code in ("1", "2", "7"):
              vigorous = "3"
          elif vigorous_code in ("3", "4"):
              vigorous = "1"
          else:
              vigorous = np.nan
          moderate_code = line[235:236]
          if moderate_code in ("1", "2", "7"):
              moderate = "2"
          elif moderate_code in ("3", "4"):
              moderate = "1"
          else:
              moderate = np.nan
          # Combined level: 3 vigorous, 2 moderate, 1 inactive, NaN if neither
          # question has a usable answer.
          if vigorous == "3":
              activity_level = "3"
          elif moderate == "2":
              activity_level = "2"
          elif vigorous == "1" or moderate == "1":
              activity_level = "1"
          else:
              activity_level = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          PHYSICAL_ACTIVITY_LEVEL_list.append(activity_level)
          SMOKED_list.append(smoked)
          DRINKED_list.append(drinked)
          BMI_list.append(bmi)
          HEART_PROBL_list.append(heart_probl)
          STROKE_list.append(stroke)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "SMOKED": SMOKED_list,
      "DRINKED": DRINKED_list,
      "BMI": BMI_list,
      "HEART_PROBL": HEART_PROBL_list,
      "STROKE": STROKE_list,
      "PHYSICAL_ACTIVITY_LEVEL": PHYSICAL_ACTIVITY_LEVEL_list,
  }
  result_2016_two = pd.DataFrame(data)
  # Keep every demographic row; attach health fields where HHID + PN match.
  result_2016 = pd.merge(result_2016_one, result_2016_two, on=["HHID", "PN"], how="left")
  # 2016 wave: marital status and education read from H16B_R.da.
  # Every record is sliced by character position; HHID (chars 0-5) and
  # PN (chars 6-8) together identify one respondent and are the merge keys.
  HHID_list = []
  PN_list = []
  MARITAL_STATUS_list = []
  EDUCATION_list = []
  with open("/root/r_base/HRS/2016/data/H16B_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Raw marital codes: 1 married, 2 annulled, 3 separated, 4 divorced,
          # 5 widowed, 6 never married, 7 other, 8 DK/NA, 9 refused.
          # Collapsed to "1" (married or partner) / "5" (other); any other
          # value, including a blank, becomes NaN.
          marital_code = line[275:276]
          if marital_code == "1":
              marital_status = "1"
          elif marital_code in ("2", "3", "4", "5", "6", "7"):
              marital_status = "5"
          else:
              marital_status = np.nan
          # Education: 0 none, 1-11 grades, 12 high school, 13-15 some college,
          # 16 college grad, 17 post college; 97 other, 98 DK/NA, 99 refused.
          education = line[39:41]
          # Test the stripped value: a raw two-character slice such as "  "
          # never equals "", so blank fields were previously kept as data.
          # Valid values are stored exactly as read.
          if education.strip() in ("", "97", "98", "99"):
              education = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          MARITAL_STATUS_list.append(marital_status)
          EDUCATION_list.append(education)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "MARITAL_STATUS": MARITAL_STATUS_list,
      "EDUCATION": EDUCATION_list,
  }
  result_2016_four = pd.DataFrame(data)
  # Left-join onto the 2016 frame built so far, then stack the finished wave
  # under the waves already accumulated in `result`.
  result_2016 = pd.merge(result_2016, result_2016_four, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_2016], axis=0)
  # ---- 2018 wave ----
  # Demographics read from H18PR_R.da: birth year and sex, one row per record,
  # keyed by HHID (chars 0-5) and PN (chars 6-8).
  HHID_list = []
  PN_list = []
  BIRTH_YEAR_list = []
  SEX_list = []
  with open("/root/r_base/HRS/2018/data/H18PR_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          birth_year = line[25:29]
          # Sex: 1 male, 2 female.
          sex = line[20:21]
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          # A blank field in a positional record is whitespace, not "", so
          # missing values are detected on the stripped text; non-blank values
          # are stored exactly as read.
          BIRTH_YEAR_list.append(birth_year if birth_year.strip() else np.nan)
          SEX_list.append(sex if sex.strip() else np.nan)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "BIRTH_YEAR": BIRTH_YEAR_list,
      "SEX": SEX_list,
  }
  # Scalar entry: broadcast to every row when the frame is built.
  data["WAVE"] = 2018
  result_2018_one = pd.DataFrame(data)
  # 2018 wave: health behaviours and conditions read from H18C_R.da.
  # Fields are sliced by character position; HHID + PN are the merge keys.
  # Several fields are two characters wide here (they can hold "-8", web
  # non-response), so they are stripped before being compared with the
  # one-character codes; unstripped, a padded value such as " 1" matched
  # nothing and every row fell through to the final default.
  HHID_list = []
  PN_list = []
  SMOKED_list = []
  DRINKED_list = []
  BMI_list = []
  HEART_PROBL_list = []
  PHYSICAL_ACTIVITY_LEVEL_list = []
  STROKE_list = []
  with open("/root/r_base/HRS/2018/data/H18C_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Smoking: 1 yes, 5 no; anything else is missing.
          smoked = line[381:382]
          if smoked not in ("1", "5"):
              smoked = np.nan
          # Alcohol: 1 yes, 3 never used alcohol, 5 no, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no (code 3 counts as no).
          drink_code = line[404:406].strip()
          if drink_code == "1":
              drinked = "1"
          elif drink_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          # Weight: pounds converted to kilograms; negative values and values
          # above 400 are treated as missing.
          weight_raw = line[418:421].strip()
          weight_kg = np.nan
          if weight_raw:
              pounds = float(weight_raw)
              if not pounds > 400 and not pounds < 0:
                  weight_kg = pounds * 0.45359237
          # Height: feet plus inches converted to metres. "-8" in the feet
          # field is excluded as well, otherwise a negative height would be
          # squared into a plausible-looking BMI. Inches that are blank,
          # negative or a 98/99 sentinel are ignored.
          feet_raw = line[428:430].strip()
          inches_raw = line[430:435].strip()
          bmi = np.nan
          if feet_raw not in ("", "8", "9", "-8") and not pd.isna(weight_kg):
              height_m = float(feet_raw) * 0.3048
              if inches_raw:
                  inches = float(inches_raw)
                  if inches >= 0 and inches not in (98.0, 99.0):
                      height_m += inches * 0.0254
              # Guard against a non-positive height (division by zero).
              if height_m > 0:
                  bmi = weight_kg / math.pow(height_m, 2)
          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure or other): -8 web non-response, 1 yes,
          # 4 never had heart problem, 5 no, 6 had it before but not now and
          # not on medication, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no.
          heart_code = line[86:88].strip()
          if heart_code in ("", "8", "9", "-8"):
              heart_probl = np.nan
          elif heart_code == "1":
              heart_probl = "1"
          else:
              heart_probl = "5"
          # Stroke: -8 web non-response, 1 yes, 2 possible stroke or TIA,
          # 4 never had a stroke, 5 no, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no (2, 4, 5 count as no).
          stroke_code = line[162:164].strip()
          if stroke_code in ("", "8", "9", "-8"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          # Activity frequency codes: 1 more than once a week, 2 once a week,
          # 3 one to three times a month, 4 hardly ever or never, 7 every day,
          # 8 DK, 9 refused. Codes 1/2/7 count as active, 3/4 as inactive.
          vigorous_code = line[367:369].strip()
          if vigorous_code in ("1", "2", "7"):
              vigorous = "3"
          elif vigorous_code in ("3", "4"):
              vigorous = "1"
          else:
              vigorous = np.nan
          moderate_code = line[369:371].strip()
          if moderate_code in ("1", "2", "7"):
              moderate = "2"
          elif moderate_code in ("3", "4"):
              moderate = "1"
          else:
              moderate = np.nan
          # Combined level: 3 vigorous, 2 moderate, 1 inactive, NaN if neither
          # question has a usable answer.
          if vigorous == "3":
              activity_level = "3"
          elif moderate == "2":
              activity_level = "2"
          elif vigorous == "1" or moderate == "1":
              activity_level = "1"
          else:
              activity_level = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          PHYSICAL_ACTIVITY_LEVEL_list.append(activity_level)
          SMOKED_list.append(smoked)
          DRINKED_list.append(drinked)
          BMI_list.append(bmi)
          HEART_PROBL_list.append(heart_probl)
          STROKE_list.append(stroke)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "SMOKED": SMOKED_list,
      "DRINKED": DRINKED_list,
      "BMI": BMI_list,
      "HEART_PROBL": HEART_PROBL_list,
      "STROKE": STROKE_list,
      "PHYSICAL_ACTIVITY_LEVEL": PHYSICAL_ACTIVITY_LEVEL_list,
  }
  result_2018_two = pd.DataFrame(data)
  # Keep every demographic row; attach health fields where HHID + PN match.
  result_2018 = pd.merge(result_2018_one, result_2018_two, on=["HHID", "PN"], how="left")
  # 2018 wave: marital status and education read from H18B_R.da.
  # Every record is sliced by character position; HHID (chars 0-5) and
  # PN (chars 6-8) together identify one respondent and are the merge keys.
  HHID_list = []
  PN_list = []
  MARITAL_STATUS_list = []
  EDUCATION_list = []
  with open("/root/r_base/HRS/2018/data/H18B_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Raw marital codes: 1 married, 2 annulled, 3 separated, 4 divorced,
          # 5 widowed, 6 never married, 7 other, 8 DK/NA, 9 refused.
          # Collapsed to "1" (married or partner) / "5" (other); any other
          # value, including a blank, becomes NaN.
          marital_code = line[287:288]
          if marital_code == "1":
              marital_status = "1"
          elif marital_code in ("2", "3", "4", "5", "6", "7"):
              marital_status = "5"
          else:
              marital_status = np.nan
          # Education: 0 none, 1-11 grades, 12 high school, 13-15 some college,
          # 16 college grad, 17 post college; 97 other, 98 DK/NA, 99 refused.
          education = line[41:43]
          # Test the stripped value: a raw two-character slice such as "  "
          # never equals "", so blank fields were previously kept as data.
          # Valid values are stored exactly as read.
          if education.strip() in ("", "97", "98", "99"):
              education = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          MARITAL_STATUS_list.append(marital_status)
          EDUCATION_list.append(education)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "MARITAL_STATUS": MARITAL_STATUS_list,
      "EDUCATION": EDUCATION_list,
  }
  result_2018_four = pd.DataFrame(data)
  # Left-join onto the 2018 frame built so far, then stack the finished wave
  # under the waves already accumulated in `result`.
  result_2018 = pd.merge(result_2018, result_2018_four, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_2018], axis=0)
  # ---- 2020 wave ----
  # Demographics read from H20PR_R.da: birth year and sex, one row per record,
  # keyed by HHID (chars 0-5) and PN (chars 6-8).
  HHID_list = []
  PN_list = []
  BIRTH_YEAR_list = []
  SEX_list = []
  with open("/root/r_base/HRS/2020/data/H20PR_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          birth_year = line[36:40]
          # Sex: 1 male, 2 female.
          sex = line[33:34]
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          # A blank field in a positional record is whitespace, not "", so
          # missing values are detected on the stripped text; non-blank values
          # are stored exactly as read.
          BIRTH_YEAR_list.append(birth_year if birth_year.strip() else np.nan)
          SEX_list.append(sex if sex.strip() else np.nan)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "BIRTH_YEAR": BIRTH_YEAR_list,
      "SEX": SEX_list,
  }
  # Scalar entry: broadcast to every row when the frame is built.
  data["WAVE"] = 2020
  result_2020_one = pd.DataFrame(data)
  # 2020 wave: health behaviours and conditions read from H20C_R.da.
  # Fields are sliced by character position; HHID + PN are the merge keys.
  # Several fields are two characters wide here (they can hold "-8", web
  # non-response), so they are stripped before being compared with the
  # one-character codes; unstripped, a padded value such as " 1" matched
  # nothing and every row fell through to the final default.
  HHID_list = []
  PN_list = []
  SMOKED_list = []
  DRINKED_list = []
  BMI_list = []
  HEART_PROBL_list = []
  PHYSICAL_ACTIVITY_LEVEL_list = []
  STROKE_list = []
  with open("/root/r_base/HRS/2020/data/H20C_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Smoking: 1 yes, 5 no; anything else is missing.
          smoked = line[339:340]
          if smoked not in ("1", "5"):
              smoked = np.nan
          # Alcohol: 1 yes, 3 never used alcohol, 5 no, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no (code 3 counts as no).
          drink_code = line[363:365].strip()
          if drink_code == "1":
              drinked = "1"
          elif drink_code in ("3", "5"):
              drinked = "5"
          else:
              drinked = np.nan
          # Weight: pounds converted to kilograms; negative values and values
          # above 400 are treated as missing.
          weight_raw = line[380:383].strip()
          weight_kg = np.nan
          if weight_raw:
              pounds = float(weight_raw)
              if not pounds > 400 and not pounds < 0:
                  weight_kg = pounds * 0.45359237
          # Height: feet plus inches converted to metres. Inches that are
          # blank, negative or a 98/99 sentinel are ignored (99 was previously
          # unchecked in this wave, unlike the 2018 block).
          feet_raw = line[389:390].strip()
          inches_raw = line[390:395].strip()
          bmi = np.nan
          if feet_raw not in ("", "8", "9") and not pd.isna(weight_kg):
              height_m = float(feet_raw) * 0.3048
              if inches_raw:
                  inches = float(inches_raw)
                  if inches >= 0 and inches not in (98.0, 99.0):
                      height_m += inches * 0.0254
              # Guard against a zero height, which would divide by zero.
              if height_m > 0:
                  bmi = weight_kg / math.pow(height_m, 2)
          # Heart problems (heart attack, coronary heart disease, angina,
          # congestive heart failure or other): -8 web non-response, 1 yes,
          # 4 never had heart problem, 5 no, 6 had it before but not now and
          # not on medication, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no.
          heart_code = line[73:75].strip()
          if heart_code in ("", "8", "9", "-8"):
              heart_probl = np.nan
          elif heart_code == "1":
              heart_probl = "1"
          else:
              heart_probl = "5"
          # Stroke: -8 web non-response, 1 yes, 2 possible stroke or TIA,
          # 4 never had a stroke, 5 no, 8 DK/NA, 9 refused.
          # Collapsed to "1" yes / "5" no (2, 4, 5 count as no).
          stroke_code = line[138:140].strip()
          if stroke_code in ("", "8", "9", "-8"):
              stroke = np.nan
          elif stroke_code in ("2", "4", "5"):
              stroke = "5"
          else:
              stroke = "1"
          # Activity frequency codes: 1 more than once a week, 2 once a week,
          # 3 one to three times a month, 4 hardly ever or never, 7 every day,
          # 8 DK, 9 refused. Codes 1/2/7 count as active, 3/4 as inactive.
          vigorous_code = line[325:327].strip()
          if vigorous_code in ("1", "2", "7"):
              vigorous = "3"
          elif vigorous_code in ("3", "4"):
              vigorous = "1"
          else:
              vigorous = np.nan
          moderate_code = line[327:329].strip()
          if moderate_code in ("1", "2", "7"):
              moderate = "2"
          elif moderate_code in ("3", "4"):
              moderate = "1"
          else:
              moderate = np.nan
          # Combined level: 3 vigorous, 2 moderate, 1 inactive, NaN if neither
          # question has a usable answer.
          if vigorous == "3":
              activity_level = "3"
          elif moderate == "2":
              activity_level = "2"
          elif vigorous == "1" or moderate == "1":
              activity_level = "1"
          else:
              activity_level = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          PHYSICAL_ACTIVITY_LEVEL_list.append(activity_level)
          SMOKED_list.append(smoked)
          DRINKED_list.append(drinked)
          BMI_list.append(bmi)
          HEART_PROBL_list.append(heart_probl)
          STROKE_list.append(stroke)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "SMOKED": SMOKED_list,
      "DRINKED": DRINKED_list,
      "BMI": BMI_list,
      "HEART_PROBL": HEART_PROBL_list,
      "STROKE": STROKE_list,
      "PHYSICAL_ACTIVITY_LEVEL": PHYSICAL_ACTIVITY_LEVEL_list,
  }
  result_2020_two = pd.DataFrame(data)
  # Keep every demographic row; attach health fields where HHID + PN match.
  result_2020 = pd.merge(result_2020_one, result_2020_two, on=["HHID", "PN"], how="left")
  # 2020 wave: marital status and education read from H20B_R.da.
  # Every record is sliced by character position; HHID (chars 0-5) and
  # PN (chars 6-8) together identify one respondent and are the merge keys.
  HHID_list = []
  PN_list = []
  MARITAL_STATUS_list = []
  EDUCATION_list = []
  with open("/root/r_base/HRS/2020/data/H20B_R.da", "r", encoding="utf-8") as file:
      # Read the file one record at a time.
      for line in file:
          # Raw marital codes: 1 married, 2 annulled, 3 separated, 4 divorced,
          # 5 widowed, 6 never married, 7 other, 8 DK/NA, 9 refused.
          # Collapsed to "1" (married or partner) / "5" (other); any other
          # value, including a blank, becomes NaN.
          marital_code = line[304:305]
          if marital_code == "1":
              marital_status = "1"
          elif marital_code in ("2", "3", "4", "5", "6", "7"):
              marital_status = "5"
          else:
              marital_status = np.nan
          # Education: 0 none, 1-11 grades, 12 high school, 13-15 some college,
          # 16 college grad, 17 post college; 97 other, 98 DK/NA, 99 refused.
          education = line[40:42]
          # Test the stripped value: a raw two-character slice such as "  "
          # never equals "", so blank fields were previously kept as data.
          # Valid values are stored exactly as read.
          if education.strip() in ("", "97", "98", "99"):
              education = np.nan
          HHID_list.append(line[0:6])
          PN_list.append(line[6:9])
          MARITAL_STATUS_list.append(marital_status)
          EDUCATION_list.append(education)
  data = {
      "HHID": HHID_list,
      "PN": PN_list,
      "MARITAL_STATUS": MARITAL_STATUS_list,
      "EDUCATION": EDUCATION_list,
  }
  result_2020_four = pd.DataFrame(data)
  # Left-join onto the 2020 frame built so far, then stack the finished wave
  # under the waves already accumulated in `result`.
  result_2020 = pd.merge(result_2020, result_2020_four, on=["HHID", "PN"], how="left")
  result = pd.concat([result, result_2020], axis=0)
  # Write the accumulated waves to a single CSV without the index column.
  result.to_csv("/root/r_base/HRS/result_all.csv", index=False)