123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- import pandas as pd
def get_smoked(
    path_1998="/root/r_base/HRS/1998/data/H98CS_R.DA",
    path_2004="/root/r_base/HRS/2004/data/H04PR_R.da",
    output_path="/root/r_base/HRS/result_born_sex.csv",
):
    """Extract birth year and sex for every respondent and write them to CSV.

    Two fixed-width text files are read line by line. From each line the
    household id (columns 0-5), person number (columns 6-8), birth year and
    sex are sliced out as raw strings. Rows are de-duplicated on
    ``(HHID, PN)`` keeping the last occurrence, so a person present in both
    files keeps the values read from the second (2004) file.

    NOTE(review): despite its name, this function does not read any smoking
    field; it only produces HHID / PN / BORN_YEAR / SEX.

    Args:
        path_1998: Fixed-width file where birth year is at columns 66-69 and
            sex at column 70.
        path_2004: Fixed-width file where sex is at column 20 and birth year
            at columns 25-28.
        output_path: Destination of the resulting CSV (UTF-8, no index).

    Returns:
        None. The result is written to ``output_path``; every input line and
        the frame summaries before/after de-duplication go to stdout.
    """
    # (path, birth-year slice, sex slice) per input file, in read order.
    # Read order matters: drop_duplicates(keep="last") favours later rows.
    sources = [
        (path_1998, slice(66, 70), slice(70, 71)),  # respondents who joined in 1998
        (path_2004, slice(25, 29), slice(20, 21)),  # respondents who joined in 2004
    ]

    hhid_values = []
    pn_values = []
    born_year_values = []
    sex_values = []  # 1 = Male, 2 = Female

    for path, born_year_cols, sex_cols in sources:
        with open(path, "r", encoding="utf-8") as file:
            for line in file:
                hhid_values.append(line[0:6])
                pn_values.append(line[6:9])
                born_year_values.append(line[born_year_cols])
                sex_values.append(line[sex_cols])
                # Echo the raw record; end='' avoids doubling the newline.
                print(line, end='')

    result = pd.DataFrame(
        {
            "HHID": hhid_values,
            "PN": pn_values,
            "BORN_YEAR": born_year_values,
            "SEX": sex_values,
        }
    )
    # DataFrame.info() prints its summary itself and returns None, so it is
    # called directly rather than wrapped in print().
    result.info()
    # Remove duplicate respondents, keeping the most recently read record.
    result.drop_duplicates(subset=["HHID", "PN"], keep="last", inplace=True)
    result.info()
    result.to_csv(output_path, encoding="utf-8", index=False)
def read_fixed_width(path, layout):
    """Slice named columns out of every line of a fixed-width text file.

    Args:
        path: File to read (opened as UTF-8 text).
        layout: Mapping of column name to ``(start, stop)`` character
            offsets, used as ``line[start:stop]``.

    Returns:
        A dict with the same keys (and key order) as ``layout``; each value
        is the list of raw string slices, one entry per line of the file.
        Every line read is also echoed to stdout.
    """
    columns = {name: [] for name in layout}
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            for name, (start, stop) in layout.items():
                columns[name].append(line[start:stop])
            # Echo the raw record; end='' avoids doubling the newline.
            print(line, end='')
    return columns


if __name__ == "__main__":
    # result_born_sex = pd.read_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8")
    # 2006 data
    year = "20"
    wave = "06"

    # Demographic information for people who joined before 2006.
    data = read_fixed_width(
        "/root/r_base/HRS/2006/data/H06PR_R.da",
        {
            "HHID": (0, 6),
            "PN": (6, 9),
            "BORN_YEAR": (25, 29),
            # 1. Male
            # 2. Female
            "SEX": (20, 21),
            # 0. UNKNOWN
            # 1. MARRIED
            # 2. MARRIED SP ABSENT (IN INSTITUTION)
            # 3. MARRIED SP ABSENT (NOT IN INSTITUTION)
            # 4. DIVORCED/SEPARATED
            # 5. WIDOWED
            # 6. NEVER MARRIED
            "MARITAL_STATUS": (106, 107),
            # 0 For no formal education
            # 1-11 .....Grades
            # 12 .......High school
            # 13-15 ....Some college
            # 16 .......College grad
            # 17 .......Post college (17+ years)
            # 97 .......Other
            "EDUCATION": (585, 587),
        },
    )
    # A scalar value is broadcast by pandas to every row of the frame.
    data["WAVE"] = 2006
    result = pd.DataFrame(data)

    # Original comment: "merge the people newly added in 2006 into the data".
    # What is actually read here is the alcohol-use answer from H06C_R.
    data = read_fixed_width(
        "/root/r_base/HRS/2006/data/H06C_R.da",
        {
            "HHID": (0, 6),
            "PN": (6, 9),
            # Drinks alcohol
            # 1. YES
            # 3. [VOL] NEVER HAVE USED ALCOHOL
            # 5. NO
            # 8. DK (Don't Know); NA (Not Ascertained)
            # 9. RF (Refused)
            # Blank. INAP (Inapplicable); Partial Interview
            "DRINK": (207, 208),
        },
    )
    # The frame now carries the DRINK column that was parsed above; the
    # previous code attached the EDUCATION list from the other file instead.
    # NOTE(review): this still replaces the demographic frame built above
    # rather than merging with it, as in the original script - confirm
    # whether a merge on (HHID, PN) was intended.
    result = pd.DataFrame(data)
    # DataFrame.info() prints its summary itself and returns None, so it is
    # called directly rather than wrapped in print().
    result.info()
|