Browse Source

HRS处理数据库

root 3 tháng trước cách đây
mục cha
commit
d901bf715c
1 tập tin đã thay đổi với 83 bổ sung và 34 xóa
  1. 83 34
      HRS_P/HRS_preprocess.py

+ 83 - 34
HRS_P/HRS_preprocess.py

@@ -1,42 +1,10 @@
 import pandas as pd
 
-
-#HHID、PN
-
-
-
-if __name__ == "__main__":
+def get_smoked():
     #获取所有人的出生年月
     HHID_list = []
     PN_list = []
-    BORN_YEAR_list = []
-    SEX_list = []
-    #1992加入人群
-    with(open("/root/r_base/HRS/1992/data/health.da", "r", encoding="utf-8") )as file:
-        # 逐行读取文件
-        for line in file:
-            HHID = line[0:6]
-            PN = line[6:9]
-            BORN_YEAR = line[99:104]
-            SEX = line[109:110]   #1.Male 2.Female
-            HHID_list.append(HHID)
-            PN_list.append(PN)
-            BORN_YEAR_list.append(BORN_YEAR)
-            SEX_list.append(SEX)
-            print(line, end='')  # end='' 用来避免多余的换行
-    #93年加入
-    with(open("/root/r_base/HRS/1993/data/BR21.DA", "r", encoding="utf-8") )as file:
-        # 逐行读取文件
-        for line in file:
-            HHID = line[0:6]
-            PN = line[6:9]
-            BORN_YEAR = line[61:65]
-            SEX = line[16:17]   #1.Male 2.Female
-            HHID_list.append(HHID)
-            PN_list.append(PN)
-            BORN_YEAR_list.append(BORN_YEAR)
-            SEX_list.append(SEX)
-            print(line, end='')  # end='' 用来避免多余的换行
+    SMOKED_list = []
     #98年加入
     with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
         # 逐行读取文件
@@ -74,5 +42,86 @@ if __name__ == "__main__":
     #进行去重处理
     result.drop_duplicates(subset=["HHID","PN"], keep="last", inplace=True)
     print(result.info())
+    result.to_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8", index=False)
 
+if __name__ == "__main__":
+    # result_born_sex = pd.read_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8")
+    # 2006 data: build one table keyed by (HHID, PN) from the two fixed-width files below
+    year = "20"
+    wave = "06"
+    # Get the information of people who joined before 2006
+    with open("/root/r_base/HRS/2006/data/H06PR_R.da", "r", encoding="utf-8") as file:
+        HHID_list = []
+        PN_list = []
+        BORN_YEAR_list = []
+        SEX_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # Read the file line by line; every field is a fixed character slice of the line
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BORN_YEAR = line[25:29]
+            # 1.Male
+            # 2.Female
+            SEX = line[20:21]
+            # 0.  UNKNOWN
+            # 1.  MARRIED
+            # 2.  MARRIED SP ABSENT (IN INSTITUTION)
+            # 3.  MARRIED SP ABSENT (NOT IN INSTITUTION)
+            # 4.  DIVORCED/SEPARATED
+            # 5.  WIDOWED
+            # 6.  NEVER MARRIED
+            MARITAL_STATUS = line[106:107]
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college (17+ years)
+            # 97 .......Other
+            EDUCATION = line[585:587]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BORN_YEAR_list.append(BORN_YEAR)
+            SEX_list.append(SEX)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+            print(line, end='')  # end='' avoids an extra newline
+        data = {  # NOTE(review): BORN_YEAR/SEX/MARITAL_STATUS are collected above but not exported yet
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "EDUCATION":EDUCATION_list
+        }
+        data["WAVE"] = 2006
+        result = pd.DataFrame(data)
+    # Merge the H06C_R records (incl. people newly added in 2006) into the data; see the join below
+    with open("/root/r_base/HRS/2006/data/H06C_R.da", "r", encoding="utf-8") as file:
+        HHID_list = []
+        PN_list = []
+        DRINK_list = []
+        # Read the file line by line
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # Drinking (alcohol use)
+            #      1.  YES
+            #      3.  [VOL] NEVER HAVE USED ALCOHOL
+            #      5.  NO
+            #      8.  DK (Don't Know); NA (Not Ascertained)
+            #      9.  RF (Refused)
+            #  Blank.  INAP (Inapplicable); Partial Interview
+            DRINK = line[207:208]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            DRINK_list.append(DRINK)
+            print(line, end='')  # end='' avoids an extra newline
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "DRINK":DRINK_list  # was the stale EDUCATION_list from the first file
+        }
+        data["WAVE"] = 2006  # same constant as the first frame so it can serve as a join key
+        result = result.merge(pd.DataFrame(data), on=["HHID","PN","WAVE"], how="outer")  # keep rows found in either file
+        result.info()  # info() prints its own report and returns None, so it is not wrapped in print()