"""Collect birth year and sex for every person across several cohort files.

Each input is a fixed-width text file. Every line yields one record made of
four character slices: HHID (columns 0-5), PN (columns 6-8), BORN_YEAR and
SEX (file-specific columns). The records of all files are stacked into one
DataFrame and then de-duplicated on the (HHID, PN) pair.
"""
import pandas as pd

# Output column names, in the order the slices are taken from each line.
PERSON_COLUMNS = ["HHID", "PN", "BORN_YEAR", "SEX"]

# One entry per cohort file: (path, BORN_YEAR span, SEX span).
# Spans are 0-based, end-exclusive character offsets into each line.
# The original comments give the SEX coding as 1 = Male, 2 = Female.
# The order matters: de-duplication keeps the LAST occurrence of a person,
# so a record from a later file overrides one from an earlier file.
COHORT_SOURCES = [
    # People who joined in 1992.
    # NOTE(review): this BORN_YEAR span is 5 characters wide while every other
    # file uses 4 -- confirm against the 1992 codebook.
    ("/root/r_base/HRS/1992/data/health.da", (99, 104), (109, 110)),
    # People who joined in 1993.
    ("/root/r_base/HRS/1993/data/BR21.DA", (61, 65), (16, 17)),
    # People who joined in 1998.
    ("/root/r_base/HRS/1998/data/H98CS_R.DA", (66, 70), (70, 71)),
    # People who joined in 2004.
    ("/root/r_base/HRS/2004/data/H04PR_R.da", (25, 29), (20, 21)),
]


def read_fixed_width_records(path, born_year_span, sex_span, echo=True):
    """Read one fixed-width file and return its records.

    Args:
        path: Path of the text file to read (decoded as UTF-8).
        born_year_span: ``(start, stop)`` offsets of the BORN_YEAR field.
        sex_span: ``(start, stop)`` offsets of the SEX field.
        echo: When true, print every line as it is read (the original
            script's behaviour).

    Returns:
        A list of ``(HHID, PN, BORN_YEAR, SEX)`` string tuples, one per line.
        Fields are raw slices; nothing is stripped or converted.
    """
    born_year_cols = slice(*born_year_span)
    sex_cols = slice(*sex_span)
    records = []
    with open(path, "r", encoding="utf-8") as file:
        # Read the file line by line.
        for line in file:
            records.append(
                (line[0:6], line[6:9], line[born_year_cols], line[sex_cols])
            )
            if echo:
                # end="" avoids a doubled newline: the line already ends in one.
                print(line, end="")
    return records


def build_person_table(sources, echo=True):
    """Stack the records of every source file into a single DataFrame.

    Args:
        sources: Iterable of ``(path, born_year_span, sex_span)`` tuples,
            processed in the given order.
        echo: Forwarded to :func:`read_fixed_width_records`.

    Returns:
        A DataFrame with columns HHID, PN, BORN_YEAR, SEX containing one row
        per input line, duplicates included.
    """
    rows = []
    for path, born_year_span, sex_span in sources:
        rows.extend(
            read_fixed_width_records(path, born_year_span, sex_span, echo=echo)
        )
    return pd.DataFrame(rows, columns=PERSON_COLUMNS)


def deduplicate_persons(frame):
    """Return *frame* with one row per (HHID, PN), keeping the last occurrence."""
    return frame.drop_duplicates(subset=["HHID", "PN"], keep="last")


def main():
    """Load all cohort files, then print a summary before and after de-duplication."""
    result = build_person_table(COHORT_SOURCES)
    # DataFrame.info() prints the summary itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    result.info()

    # De-duplicate on the person key.
    result = deduplicate_persons(result)
    result.info()


if __name__ == "__main__":
    main()