|
@@ -1,42 +1,10 @@
|
|
|
import pandas as pd
|
|
|
|
|
|
-
|
|
|
-#HHID、PN
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-if __name__ == "__main__":
|
|
|
+def get_smoked():
|
|
|
#获取所有人的出生年月
|
|
|
HHID_list = []
|
|
|
PN_list = []
|
|
|
- BORN_YEAR_list = []
|
|
|
- SEX_list = []
|
|
|
- #1992加入人群
|
|
|
- with(open("/root/r_base/HRS/1992/data/health.da", "r", encoding="utf-8") )as file:
|
|
|
- # 逐行读取文件
|
|
|
- for line in file:
|
|
|
- HHID = line[0:6]
|
|
|
- PN = line[6:9]
|
|
|
- BORN_YEAR = line[99:104]
|
|
|
- SEX = line[109:110] #1.Male 2.Female
|
|
|
- HHID_list.append(HHID)
|
|
|
- PN_list.append(PN)
|
|
|
- BORN_YEAR_list.append(BORN_YEAR)
|
|
|
- SEX_list.append(SEX)
|
|
|
- print(line, end='') # end='' 用来避免多余的换行
|
|
|
- #93年加入
|
|
|
- with(open("/root/r_base/HRS/1993/data/BR21.DA", "r", encoding="utf-8") )as file:
|
|
|
- # 逐行读取文件
|
|
|
- for line in file:
|
|
|
- HHID = line[0:6]
|
|
|
- PN = line[6:9]
|
|
|
- BORN_YEAR = line[61:65]
|
|
|
- SEX = line[16:17] #1.Male 2.Female
|
|
|
- HHID_list.append(HHID)
|
|
|
- PN_list.append(PN)
|
|
|
- BORN_YEAR_list.append(BORN_YEAR)
|
|
|
- SEX_list.append(SEX)
|
|
|
- print(line, end='') # end='' 用来避免多余的换行
|
|
|
+ SMOKED_list = []
|
|
|
#98年加入
|
|
|
with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
|
|
|
# 逐行读取文件
|
|
@@ -74,5 +42,86 @@ if __name__ == "__main__":
|
|
|
#进行去重处理
|
|
|
result.drop_duplicates(subset=["HHID","PN"], keep="last", inplace=True)
|
|
|
print(result.info())
|
|
|
+ result.to_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8", index=False)
|
|
|
|
|
|
def _read_fixed_width(path, fields):
    """Read a fixed-width text file and slice every line into named fields.

    Args:
        path: Path of the file to read (decoded as UTF-8).
        fields: Mapping of field name to a ``(start, stop)`` pair, applied to
            each line as the slice ``line[start:stop]``.

    Returns:
        A dict mapping each field name to a list of raw substrings, one entry
        per line in file order. Values are neither stripped nor converted.
    """
    columns = {name: [] for name in fields}
    with open(path, "r", encoding="utf-8") as file:
        # Read the file line by line.
        for line in file:
            for name, (start, stop) in fields.items():
                columns[name].append(line[start:stop])
            print(line, end='')  # end='' avoids an extra newline
    return columns


if __name__ == "__main__":
    # Disabled: reload of the CSV written to /root/r_base/HRS/result_born_sex.csv
    # result_born_sex = pd.read_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8")

    # 2006 data
    year = "20"  # currently unused
    wave = "06"  # currently unused

    # Information for respondents who joined before 2006.
    pr_fields = {
        "HHID": (0, 6),
        "PN": (6, 9),
        "BORN_YEAR": (25, 29),
        # 1.Male
        # 2.Female
        "SEX": (20, 21),
        # 0. UNKNOWN
        # 1. MARRIED
        # 2. MARRIED SP ABSENT (IN INSTITUTION)
        # 3. MARRIED SP ABSENT (NOT IN INSTITUTION)
        # 4. DIVORCED/SEPARATED
        # 5. WIDOWED
        # 6. NEVER MARRIED
        "MARITAL_STATUS": (106, 107),
        # 0 For no formal education
        # 1-11 .....Grades
        # 12 .......High school
        # 13-15 ....Some college
        # 16 .......College grad
        # 17 .......Post college (17+ years)
        # 97 .......Other
        "EDUCATION": (585, 587),
    }
    pr_columns = _read_fixed_width("/root/r_base/HRS/2006/data/H06PR_R.da", pr_fields)

    # BORN_YEAR, SEX and MARITAL_STATUS are parsed but not exported here.
    data = {
        "HHID": pr_columns["HHID"],
        "PN": pr_columns["PN"],
        "EDUCATION": pr_columns["EDUCATION"],
    }
    data["WAVE"] = 2006
    result = pd.DataFrame(data)
    # NOTE(review): this frame is replaced by the drinking frame below and is
    # never combined with it; the comment on the next section suggests a merge
    # on HHID/PN was intended -- confirm before relying on `result`.

    # Merge respondents newly added in 2006 into the data.
    c_fields = {
        "HHID": (0, 6),
        "PN": (6, 9),
        # Alcohol consumption
        # 1. YES
        # 3. [VOL] NEVER HAVE USED ALCOHOL
        # 5. NO
        # 8. DK (Don't Know); NA (Not Ascertained)
        # 9. RF (Refused)
        # Blank. INAP (Inapplicable); Partial Interview
        "DRINK": (207, 208),
    }
    c_columns = _read_fixed_width("/root/r_base/HRS/2006/data/H06C_R.da", c_fields)

    # Fix: the original stored the EDUCATION list from the previous file under
    # an "EDUCATION" key here, discarding the DRINK values just parsed (and
    # failing in pd.DataFrame whenever the two files differ in row count).
    data = {
        "HHID": c_columns["HHID"],
        "PN": c_columns["PN"],
        "DRINK": c_columns["DRINK"],
    }
    result = pd.DataFrame(data)
    print(result.info())
|
|
|
|