# HRS_preprocess.py (4.4 KB)
import pandas as pd
  2. def get_smoked():
  3. #获取所有人的出生年月
  4. HHID_list = []
  5. PN_list = []
  6. SMOKED_list = []
  7. #98年加入
  8. with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
  9. # 逐行读取文件
  10. for line in file:
  11. HHID = line[0:6]
  12. PN = line[6:9]
  13. BORN_YEAR = line[66:70]
  14. SEX = line[70:71] #1.Male 2.Female
  15. HHID_list.append(HHID)
  16. PN_list.append(PN)
  17. BORN_YEAR_list.append(BORN_YEAR)
  18. SEX_list.append(SEX)
  19. print(line, end='') # end='' 用来避免多余的换行
  20. #04年加入
  21. with(open("/root/r_base/HRS/2004/data/H04PR_R.da", "r", encoding="utf-8") )as file:
  22. # 逐行读取文件
  23. for line in file:
  24. HHID = line[0:6]
  25. PN = line[6:9]
  26. BORN_YEAR = line[25:29]
  27. SEX = line[20:21] #1.Male 2.Female
  28. HHID_list.append(HHID)
  29. PN_list.append(PN)
  30. BORN_YEAR_list.append(BORN_YEAR)
  31. SEX_list.append(SEX)
  32. print(line, end='') # end='' 用来避免多余的换行
  33. data = {
  34. "HHID":HHID_list,
  35. "PN":PN_list,
  36. "BORN_YEAR":BORN_YEAR_list,
  37. "SEX":SEX_list
  38. }
  39. result = pd.DataFrame(data)
  40. print(result.info())
  41. #进行去重处理
  42. result.drop_duplicates(subset=["HHID","PN"], keep="last", inplace=True)
  43. print(result.info())
  44. result.to_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8", index=False)
  45. if __name__ == "__main__":
  46. # result_born_sex = pd.read_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8")
  47. #2006数据
  48. year = "20"
  49. wave = "06"
  50. # 获取06年之前加入人员的信息
  51. with(open("/root/r_base/HRS/2006/data/H06PR_R.da", "r", encoding="utf-8") )as file:
  52. HHID_list = []
  53. PN_list = []
  54. BORN_YEAR_list = []
  55. SEX_list = []
  56. MARITAL_STATUS_list = []
  57. EDUCATION_list = []
  58. # 逐行读取文件
  59. for line in file:
  60. HHID = line[0:6]
  61. PN = line[6:9]
  62. BORN_YEAR = line[25:29]
  63. # 1.Male
  64. # 2.Female
  65. SEX = line[20:21]
  66. # 0. UNKNOWN
  67. # 1. MARRIED
  68. # 2. MARRIED SP ABSENT (IN INSTITUTION)
  69. # 3. MARRIED SP ABSENT (NOT IN INSTITUTION)
  70. # 4. DIVORCED/SEPARATED
  71. # 5. WIDOWED
  72. # 6. NEVER MARRIED
  73. MARITAL_STATUS = line[106:107]
  74. # 0 For no formal education
  75. # 1-11 .....Grades
  76. # 12 .......High school
  77. # 13-15 ....Some college
  78. # 16 .......College grad
  79. # 17 .......Post college (17+ years)
  80. # 97 .......Other
  81. EDUCATION = line[585:587]
  82. HHID_list.append(HHID)
  83. PN_list.append(PN)
  84. BORN_YEAR_list.append(BORN_YEAR)
  85. SEX_list.append(SEX)
  86. MARITAL_STATUS_list.append(MARITAL_STATUS)
  87. EDUCATION_list.append(EDUCATION)
  88. print(line, end='') # end='' 用来避免多余的换行
  89. data = {
  90. "HHID":HHID_list,
  91. "PN":PN_list,
  92. "BORN_YEAR":BORN_YEAR_list,
  93. "SEX":SEX_list,
  94. "MARITAL_STATUS":MARITAL_STATUS_list,
  95. "EDUCATION":EDUCATION_list
  96. }
  97. data["WAVE"] = 2006
  98. result = pd.DataFrame(data)
  99. # 将06年新加入的人员合并入数据
  100. with(open("/root/r_base/HRS/2006/data/H06C_R.da", "r", encoding="utf-8") )as file:
  101. HHID_list = []
  102. PN_list = []
  103. DRINK_list = []
  104. # 逐行读取文件
  105. for line in file:
  106. HHID = line[0:6]
  107. PN = line[6:9]
  108. #饮酒
  109. # 1. YES
  110. # 3. [VOL] NEVER HAVE USED ALCOHOL
  111. # 5. NO
  112. # 8. DK (Don't Know); NA (Not Ascertained)
  113. # 9. RF (Refused)
  114. # Blank. INAP (Inapplicable); Partial Interview
  115. DRINK = line[207:208]
  116. HHID_list.append(HHID)
  117. PN_list.append(PN)
  118. DRINK_list.append(DRINK)
  119. print(line, end='') # end='' 用来避免多余的换行
  120. data = {
  121. "HHID":HHID_list,
  122. "PN":PN_list,
  123. "EDUCATION":EDUCATION_list
  124. }
  125. result = pd.DataFrame(data)
  126. print(result.info())