# HRS_preprocess.py
  1. import pandas as pd
  2. #HHID、PN
  3. if __name__ == "__main__":
  4. #获取所有人的出生年月
  5. HHID_list = []
  6. PN_list = []
  7. BORN_YEAR_list = []
  8. SEX_list = []
  9. #1992加入人群
  10. with(open("/root/r_base/HRS/1992/data/health.da", "r", encoding="utf-8") )as file:
  11. # 逐行读取文件
  12. for line in file:
  13. HHID = line[0:6]
  14. PN = line[6:9]
  15. BORN_YEAR = line[99:104]
  16. SEX = line[109:110] #1.Male 2.Female
  17. HHID_list.append(HHID)
  18. PN_list.append(PN)
  19. BORN_YEAR_list.append(BORN_YEAR)
  20. SEX_list.append(SEX)
  21. print(line, end='') # end='' 用来避免多余的换行
  22. #93年加入
  23. with(open("/root/r_base/HRS/1993/data/BR21.DA", "r", encoding="utf-8") )as file:
  24. # 逐行读取文件
  25. for line in file:
  26. HHID = line[0:6]
  27. PN = line[6:9]
  28. BORN_YEAR = line[61:65]
  29. SEX = line[16:17] #1.Male 2.Female
  30. HHID_list.append(HHID)
  31. PN_list.append(PN)
  32. BORN_YEAR_list.append(BORN_YEAR)
  33. SEX_list.append(SEX)
  34. print(line, end='') # end='' 用来避免多余的换行
  35. #98年加入
  36. with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
  37. # 逐行读取文件
  38. for line in file:
  39. HHID = line[0:6]
  40. PN = line[6:9]
  41. BORN_YEAR = line[66:70]
  42. SEX = line[70:71] #1.Male 2.Female
  43. HHID_list.append(HHID)
  44. PN_list.append(PN)
  45. BORN_YEAR_list.append(BORN_YEAR)
  46. SEX_list.append(SEX)
  47. print(line, end='') # end='' 用来避免多余的换行
  48. #04年加入
  49. with(open("/root/r_base/HRS/2004/data/H04PR_R.da", "r", encoding="utf-8") )as file:
  50. # 逐行读取文件
  51. for line in file:
  52. HHID = line[0:6]
  53. PN = line[6:9]
  54. BORN_YEAR = line[25:29]
  55. SEX = line[20:21] #1.Male 2.Female
  56. HHID_list.append(HHID)
  57. PN_list.append(PN)
  58. BORN_YEAR_list.append(BORN_YEAR)
  59. SEX_list.append(SEX)
  60. print(line, end='') # end='' 用来避免多余的换行
  61. data = {
  62. "HHID":HHID_list,
  63. "PN":PN_list,
  64. "BORN_YEAR":BORN_YEAR_list,
  65. "SEX":SEX_list
  66. }
  67. result = pd.DataFrame(data)
  68. print(result.info())
  69. #进行去重处理
  70. result.drop_duplicates(subset=["HHID","PN"], keep="last", inplace=True)
  71. print(result.info())