import pandas as pd

# Field layouts for the fixed-width record files read below.
# Each span is a 0-based, end-exclusive slice into one text record.

# 1998 file (H98CS_R.DA).
_H98CS_FIELDS = {
    "HHID": (0, 6),
    "PN": (6, 9),
    "BORN_YEAR": (66, 70),
    "SEX": (70, 71),  # 1 = male, 2 = female
}

# 2004 file (H04PR_R.da).
_H04PR_FIELDS = {
    "HHID": (0, 6),
    "PN": (6, 9),
    "BORN_YEAR": (25, 29),
    "SEX": (20, 21),  # 1 = male, 2 = female
}

# 2006 file (H06PR_R.da).
_H06PR_FIELDS = {
    "HHID": (0, 6),
    "PN": (6, 9),
    "BORN_YEAR": (25, 29),
    # 1 = male, 2 = female
    "SEX": (20, 21),
    # 0 = unknown
    # 1 = married
    # 2 = married, spouse absent (in institution)
    # 3 = married, spouse absent (not in institution)
    # 4 = divorced/separated
    # 5 = widowed
    # 6 = never married
    "MARITAL_STATUS": (106, 107),
    # 0 = no formal education
    # 1-11 = grades
    # 12 = high school
    # 13-15 = some college
    # 16 = college grad
    # 17 = post college (17+ years)
    # 97 = other
    "EDUCATION": (585, 587),
}

# 2006 file (H06C_R.da).
_H06C_FIELDS = {
    "HHID": (0, 6),
    "PN": (6, 9),
    # Drinking:
    # 1 = yes
    # 3 = [volunteered] never have used alcohol
    # 5 = no
    # 8 = DK (don't know); NA (not ascertained)
    # 9 = RF (refused)
    # blank = INAP (inapplicable); partial interview
    "DRINK": (207, 208),
}


def _read_fixed_width(path: str, fields: dict, echo: bool = True) -> dict:
    """Slice every line of a fixed-width text file into per-field lists.

    Args:
        path: File to read (opened as UTF-8 text).
        fields: Mapping of column name to ``(start, stop)`` slice bounds.
        echo: When true, print each raw record as it is read.

    Returns:
        A dict with the same keys as *fields*; each value is the list of raw
        (unstripped) substrings, one entry per line of the file.  Lines that
        are too short simply yield shorter or empty strings.
    """
    columns = {name: [] for name in fields}
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            for name, (start, stop) in fields.items():
                columns[name].append(line[start:stop])
            if echo:
                # end="" because the record already carries its newline.
                print(line, end="")
    return columns


def get_smoked(
    path_1998: str = "/root/r_base/HRS/1998/data/H98CS_R.DA",
    path_2004: str = "/root/r_base/HRS/2004/data/H04PR_R.da",
    output_path: str = "/root/r_base/HRS/result_born_sex.csv",
    echo: bool = True,
) -> pd.DataFrame:
    """Build the birth-year / sex table for all respondents and save it as CSV.

    Despite its name, this function extracts ``BORN_YEAR`` and ``SEX`` (not
    smoking data).  Records from the 1998 file are followed by records from
    the 2004 file; when the same ``(HHID, PN)`` pair occurs more than once,
    the last occurrence wins.

    Args:
        path_1998: Fixed-width 1998 input file.
        path_2004: Fixed-width 2004 input file.
        output_path: Destination CSV (written without the index).
        echo: When true, print every raw input record while reading.

    Returns:
        The de-duplicated DataFrame with columns ``HHID``, ``PN``,
        ``BORN_YEAR`` and ``SEX`` (all raw strings).
    """
    joined_1998 = _read_fixed_width(path_1998, _H98CS_FIELDS, echo=echo)
    joined_2004 = _read_fixed_width(path_2004, _H04PR_FIELDS, echo=echo)

    # Concatenate in file order so that keep="last" prefers the 2004 record.
    data = {
        name: joined_1998[name] + joined_2004[name] for name in _H98CS_FIELDS
    }
    result = pd.DataFrame(data)
    # DataFrame.info() prints its report itself and returns None.
    result.info()

    # De-duplicate on the person key.
    result = result.drop_duplicates(subset=["HHID", "PN"], keep="last")
    result.info()

    result.to_csv(output_path, encoding="utf-8", index=False)
    return result


if __name__ == "__main__":
    # get_smoked() writes /root/r_base/HRS/result_born_sex.csv; it is not
    # called (or read back) here.

    # 2006 data
    year = "20"
    wave = "06"

    # Information on respondents who joined before 2006.
    pr_2006 = _read_fixed_width(
        "/root/r_base/HRS/2006/data/H06PR_R.da", _H06PR_FIELDS
    )
    data = {
        "HHID": pr_2006["HHID"],
        "PN": pr_2006["PN"],
        "EDUCATION": pr_2006["EDUCATION"],
    }
    data["WAVE"] = 2006  # scalar, broadcast to every row
    result = pd.DataFrame(data)

    # Drinking answers from the second 2006 file.
    drink = pd.DataFrame(
        _read_fixed_width("/root/r_base/HRS/2006/data/H06C_R.da", _H06C_FIELDS)
    )

    # Attach DRINK to the education table by person key.
    # NOTE(review): a left join keeps every H06PR_R row; confirm that this is
    # the intended join type and that (HHID, PN) is unique in H06C_R.
    result = result.merge(drink, on=["HHID", "PN"], how="left")
    result.info()