소스 검색

处理HRS数据库

JazzZhao 4 달 전
부모
커밋
131c42e5e1
1개의 변경된 파일, 78개의 추가 그리고 0개의 삭제
  1. 78 0
      HRS_P/HRS_preprocess.py

+ 78 - 0
HRS_P/HRS_preprocess.py

@@ -0,0 +1,78 @@
+import pandas as pd
+
+
+# Key columns: HHID and PN (together they identify one person; used for de-duplication below)
+
+
+
+if __name__ == "__main__":
+    # Gather every person's birth year and sex; the four lists below become the DataFrame columns.
+    HHID_list = []
+    PN_list = []
+    BORN_YEAR_list = []
+    SEX_list = []
+    # People who joined in 1992 (each line is sliced at fixed character positions)
+    with(open("/root/r_base/HRS/1992/data/health.da", "r", encoding="utf-8") )as file:
+        # Read the file line by line
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BORN_YEAR = line[99:104]  # NOTE(review): 5 chars wide here vs. 4 in the other files - confirm against the codebook
+            SEX = line[109:110]   # 1 = Male, 2 = Female
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BORN_YEAR_list.append(BORN_YEAR)
+            SEX_list.append(SEX)
+            print(line, end='')  # end='' avoids printing an extra newline after each line
+    # People who joined in 1993
+    with(open("/root/r_base/HRS/1993/data/BR21.DA", "r", encoding="utf-8") )as file:
+        # Read the file line by line
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BORN_YEAR = line[61:65]
+            SEX = line[16:17]   # 1 = Male, 2 = Female
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BORN_YEAR_list.append(BORN_YEAR)
+            SEX_list.append(SEX)
+            print(line, end='')  # end='' avoids printing an extra newline after each line
+    # People who joined in 1998
+    with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
+        # Read the file line by line
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BORN_YEAR = line[66:70]
+            SEX = line[70:71]   # 1 = Male, 2 = Female
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BORN_YEAR_list.append(BORN_YEAR)
+            SEX_list.append(SEX)
+            print(line, end='')  # end='' avoids printing an extra newline after each line
+    # People who joined in 2004
+    with(open("/root/r_base/HRS/2004/data/H04PR_R.da", "r", encoding="utf-8") )as file:
+        # Read the file line by line
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BORN_YEAR = line[25:29]
+            SEX = line[20:21]   # 1 = Male, 2 = Female
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BORN_YEAR_list.append(BORN_YEAR)
+            SEX_list.append(SEX)
+            print(line, end='')  # end='' avoids printing an extra newline after each line
+    data = {
+        "HHID":HHID_list,
+        "PN":PN_list,
+        "BORN_YEAR":BORN_YEAR_list,
+        "SEX":SEX_list
+    }
+    result = pd.DataFrame(data)  # every column holds the raw string slices; nothing is converted to numbers
+    print(result.info())  # NOTE(review): info() prints its own report and returns None, so this also prints "None"
+    # De-duplicate on (HHID, PN); keep="last" means the row from the most recently read file wins.
+    result.drop_duplicates(subset=["HHID","PN"], keep="last", inplace=True)
+    print(result.info())  # row count after de-duplication (same "None" caveat as above)
+
+