Sfoglia il codice sorgente

增加ELSA和HRS数据库处理代码

root 3 mesi fa
parent
commit
6b30887fdd
4 ha cambiato i file con 3066 aggiunte e 72 eliminazioni
  1. 255 0
      ELSA_P/ELSA_preprocess.py
  2. 2731 72
      HRS_P/HRS_preprocess.py
  3. 3 0
      Medical相关工作.md
  4. 77 0
      test.py

+ 255 - 0
ELSA_P/ELSA_preprocess.py

@@ -0,0 +1,255 @@
+import pandas as pd
+import numpy as np
+
+
+# Unify column names: overwrite df.columns in place, purely by position, with the 21 names below (nothing is returned); assumes df already has exactly 21 columns in this order.
+def change_columns(df):
+    df.columns = ["id", "birth_year", "sex", "marital_status", "smoking_status", "drinking_status", "vigoro_sports", "moderate_sports",
+                  "mild_sports","heart_probl", "BMI", "HbA1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","C_reactive_protein", "education"
+                  ]
+
+if __name__ == "__main__":
+    # Script entry point: for ELSA waves 1-9, select columns from the Stata files, merge the nurse / ifs_derived files that are read, rename to the unified schema, stack all waves and write one CSV.
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
+    print(df.columns)  # column listing only; df is overwritten by the wave 1 read below
+    # age
+    # sex
+        # .u:Unmar
+        # .v:SP NR
+        # 1.man
+        # 2.woman
+    # marital status
+        # .d:DK
+        # .r:Refuse
+        # 1.married
+        # 3.partnered
+        # 4.separated
+        # 5.divorced
+        # 7.widowed
+        # 8.never married
+    # education
+        # .d:DK
+        # .m:Missing
+        # .o:other
+        # .r:Refuse
+        # 1.lt high-school
+        # 3.high-school graduate
+        # 4.some college
+        # 5.college and above
+    # smoking status
+        # .d:DK
+        # .m:Missing
+        # .p:proxy
+        # .r:Refuse
+        # 0.No
+        # 1.Yes
+    # drinking status
+        # .c:no self-completion inter
+        # .d:DK
+        # .m:Missing
+        # .p:proxy
+        # .r:Refuse
+        # 0.no
+        # 1.yes
+    # physical activity level
+        # .d:DK
+        # .m:Missing
+        # .p:proxy
+        # .r:Refuse
+        # 2.> 1 per week
+        # 3.1 per week
+        # 4.1-3 per mon
+        # 5.hardly ever or never
+    # body mass index (BMI)
+    # heart_probl
+    # stroke
+    # glycated haemoglobin (HbA1c)
+    # systolic blood pressure (SBP)
+    # high-density lipoprotein cholesterol (HDL-C)
+    # C-reactive protein
+    # Values to check for in the hedim* columns; a row containing any of them gets heart_probl = 1
+    values_to_check = {1, 2, 3, 4, 5, 6, 7, 8, 95}
+    # wave 1: core + ifs_derived files; no nurse file is read, so the nurse-derived measures are set to NaN
+    # Parse the core file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_core_data_v3.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "heala","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported: 1 if any hedim* value is in values_to_check, else 0. NOTE(review): result is a column subset of df, so this assignment may trigger pandas' SettingWithCopyWarning - consider .copy()
+    heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07"]]
+    result["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    result[['BMI',"hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
+    # Parse the ifs_derived file (edqual ends up as the last column, which change_columns names "education")
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_1_ifs_derived_variables.dta', convert_categoricals=False)
+    df.columns = df.columns.str.lower()
+    result_one = df[["idauniq", "edqual"]]
+    result = pd.merge(result, result_one, on=["idauniq"], how="left")
+
+    change_columns(result)
+    print(f"wave 1 finish {result.shape}")
+
+    # wave 2: core + nurse + ifs_derived files (the sixth selected column is "scako" here, "heala" in wave 1)
+    # Parse the core file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_core_data_v4.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_2 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (hedim08 is checked as well in this wave)
+    heart_row = df[["hedim01", "hedim02", "hedim03", "hedim04", "hedim05", "hedim06", "hedim07","hedim08"]]
+    result_2["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    # Parse the nurse file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_nurse_data_v2.dta', convert_categoricals=False)
+    df.columns = df.columns.str.lower()
+    result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
+    result_2 = pd.merge(result_2, result_two, on=["idauniq"], how="left")
+    # Parse the ifs_derived file. NOTE(review): from this wave on the ifs_derived column names are not lower-cased as they are in wave 1 - confirm "idauniq"/"edqual" are already lower case in these files
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_2_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_2 = pd.merge(result_2, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_2)
+    result = pd.concat([result, result_2], axis=0)
+    print(f"wave 2 finish {result.shape}")
+
+    # wave 3: elsa_data + ifs_derived files; no nurse file is read, so the nurse-derived measures are set to NaN
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_elsa_data_v4.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_3 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported, now from the named hedim* columns. NOTE(review): the same numeric values_to_check set is reused for these columns - confirm their coding
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_3["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    result_3[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_3_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_3 = pd.merge(result_3, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_3)
+    result = pd.concat([result, result_3], axis=0)
+    print(f"wave 3 finish {result.shape}")
+
+    # wave 4: elsa_data + nurse + ifs_derived files
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_elsa_data_v3.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_4 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (1 if any listed column holds a value in values_to_check)
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_4["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    # Parse the nurse file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_nurse_data.dta', convert_categoricals=False)
+    df.columns = df.columns.str.lower()
+    result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
+    result_4 = pd.merge(result_4, result_two, on=["idauniq"], how="left")
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_4_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_4 = pd.merge(result_4, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_4)
+    result = pd.concat([result, result_4], axis=0)
+    print(f"wave 4 finish {result.shape}")
+
+    # wave 5: elsa_data + ifs_derived files; no nurse file is read, so the nurse-derived measures are set to NaN
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_elsa_data_v4.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_5 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (1 if any listed column holds a value in values_to_check)
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_5["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    result_5[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_5_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_5 = pd.merge(result_5, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_5)
+    result = pd.concat([result, result_5], axis=0)
+    print(f"wave 5 finish {result.shape}")
+
+    # wave 6: elsa_data + nurse + ifs_derived files
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_data_v2.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_6 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (1 if any listed column holds a value in values_to_check)
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_6["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    # Parse the nurse file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
+    df.columns = df.columns.str.lower()
+    result_two = df[["idauniq", "bmival", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
+    result_6 = pd.merge(result_6, result_two, on=["idauniq"], how="left")
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_6 = pd.merge(result_6, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_6)
+    result = pd.concat([result, result_6], axis=0)
+    print(f"wave 6 finish {result.shape}")
+
+    # wave 7: elsa_data + ifs_derived files; no nurse file is read, so the nurse-derived measures are set to NaN
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_elsa_data.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_7 = df[['idauniq', "indobyr", "indsex", "dimar", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (1 if any listed column holds a value in values_to_check)
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_7["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    result_7[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_7_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_7 = pd.merge(result_7, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_7)
+    result = pd.concat([result, result_7], axis=0)
+    print(f"wave 7 finish {result.shape}")
+
+    # wave 8: the fourth selected column is "dimarr" here ("dimar" in waves 1-7); BMI is set to NaN and the other nurse measures are merged from a nurse file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_data_eul_v2.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_8 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scako","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (1 if any listed column holds a value in values_to_check)
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_8["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    result_8['BMI'] = np.nan
+    # Parse the nurse file. NOTE(review): this path is the wave 6 nurse file (identical to the one read in the wave 6 section) although this is the wave 8 section - confirm this is intended
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_6_elsa_nurse_data_v2.dta', convert_categoricals=False)
+    df.columns = df.columns.str.lower()
+    result_two = df[["idauniq", "hba1c","dias1","dias2","dias3","sys1","sys2","sys3","hdl","hscrp"]]
+    result_8 = pd.merge(result_8, result_two, on=["idauniq"], how="left")
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_8_elsa_ifs_dvs_eul_v1.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_8 = pd.merge(result_8, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_8)
+    result = pd.concat([result, result_8], axis=0)
+    print(f"wave 8 finish {result.shape}")
+
+    # wave 9: the sixth selected column is "scalcm" here ("scako" in waves 2-8); no nurse file is read, so the nurse-derived measures are set to NaN
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
+    # Lower-case all column names
+    df.columns = df.columns.str.lower()
+    result_9 = df[['idauniq', "indobyr", "indsex", "dimarr", "hesmk", "scalcm","heacta", "heactb", "heactc"]]
+    # Flag whether a cardiovascular condition is reported (1 if any listed column holds a value in values_to_check)
+    heart_row = df[["hedim85", "hediman", "hedimar", "hedimbp", "hedimch", "hedimdi", "hedimhf","hedimhm","hedimmi", "hedimst"]]
+    result_9["heart_probl"] = heart_row.apply(lambda row: 1 if any(val in values_to_check for val in row) else 0, axis=1)
+    result_9[['BMI', "hba1c","diastolic1","diastolic2","diastolic3","systolic1","systolic2","systolic3","hdl","hscrp"]] = np.nan
+    # Parse the ifs_derived file
+    df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
+    result_one = df[["idauniq", "edqual"]]
+    result_9 = pd.merge(result_9, result_one, on=["idauniq"], how="left")
+
+    change_columns(result_9)
+    result = pd.concat([result, result_9], axis=0)
+    print(f"wave 9 finish {result.shape}")
+
+    print(result.head())
+
+    result.to_csv("/root/r_base/UKDA-5050-stata/result_all.csv", index=False)  # rows of all waves stacked; NOTE(review): no wave indicator column is added before writing

+ 2731 - 72
HRS_P/HRS_preprocess.py

@@ -1,59 +1,222 @@
 import pandas as pd
+import math
+import numpy as np
 
-def get_smoked():
-    #获取所有人的出生年月
-    HHID_list = []
-    PN_list = []
-    SMOKED_list = []
-    #98年加入
-    with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
+if __name__ == "__main__":
+    # age
+    # sex
+    # marital status
+    # education
+    # smoking status
+    # drinking status
+    # physical activity level
+    # body mass index (BMI)
+    # glycated haemoglobin (HbA1c)
+    # systolic blood pressure (SBP)
+    # high-density lipoprotein cholesterol (HDL-C)
+    # C-reactive protein
+
+    # 获取1992数据
+    with(open("/root/r_base/HRS/1992/data/health.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]
         # 逐行读取文件
         for line in file:
             HHID = line[0:6]
             PN = line[6:9]
-            BORN_YEAR = line[66:70]
-            SEX = line[70:71]   #1.Male 2.Female
+            BIRTH_YEAR = line[249:254]
+            # 1.Male 
+            # 2.Female
+            SEX = line[109:110]
+            # 1.      Married [Inap in V228-V238]
+            # 2.      Partner [Inap in V226-V227]
+            # 3.      Separated [Inap in V226-V234]
+            # 4.      Divorced [Inap in V226-V234]
+            # 5.      Widowed [Inap in V226-V234]
+            # 6.      Never married
+            # 7.      Married with 2 family residences--both
+            #         sampleable
+            # 8.      Married with 2 family residences--one
+            #         residence is not sampleable (institution
+            #         or out of the country)
+            # 9.      NA
+            MARITAL_STATUS = line[302:303]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="7" or MARITAL_STATUS=="8" else "5" if not MARITAL_STATUS=="9" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college (17+ years)
+            # 97 .......Other
+            EDUCATION = line[264:266]
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            SMOKED = line[519:520]
+            # 1.  Yes
+            # 5.  No [Inap in V507]
+            DRINKED = line[527:528]
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[534:535]
+            # 轻度活动
+            LIGHT_PHYSICAL = line[533:534]
+            PHYSICAL_ACTIVITY_LEVEL = 3 if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL =="2" else 2 if LIGHT_PHYSICAL=="1" or LIGHT_PHYSICAL=="2" else 1
+            # 体重
+            WEIGH= float(line[536:539])*0.45359237
+            # 身高
+            HIGHT = float(line[542:543])*0.3048 + float(line[543:545])*0.0254
+            # BMI
+            BMI = WEIGH / math.pow(HIGHT,2)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[459:460]
+            # STROKE 
+            STROKE = line[473:474]
             HHID_list.append(HHID)
             PN_list.append(PN)
-            BORN_YEAR_list.append(BORN_YEAR)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
             SEX_list.append(SEX)
-            print(line, end='')  # end='' 用来避免多余的换行
-    #04年加入
-    with(open("/root/r_base/HRS/2004/data/H04PR_R.da", "r", encoding="utf-8") )as file:
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        data["WAVE"] = 1992
+        result = pd.DataFrame(data)
+    # 获取1993数据
+    with(open("/root/r_base/HRS/1993/data/BR21.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]
         # 逐行读取文件
         for line in file:
             HHID = line[0:6]
             PN = line[6:9]
-            BORN_YEAR = line[25:29]
-            SEX = line[20:21]   #1.Male 2.Female
+            BIRTH_YEAR = line[61:65]
+            # 1.Male 
+            # 2.Female
+            SEX = line[16:17]
+            # MARRIED, SPOUSE PRESENT...........  1  
+            # MARRIED, SPOUSE ABSENT............  2  
+            # LIVING WITH SOMEONE...............  3 GO TO A11b 
+            # DIVORCED/SEPARATED................  4 GO TO A11g 
+            # WIDOWED...........................  5 GO TO A11g 
+            # NEVER MARRIED.....................  6 GO TO B1 
+            MARITAL_STATUS = line[98:99]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="3" else "5"
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college (17+ years)
+            # 97 .......Other
+            EDUCATION = line[74:76]
+            #CURRENT SMOKER.....................  1 
+            # FORMER SMOKER......................  2 GO TO B20 
+            # NEVER SMOKED.......................  3 GO TO B20 
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            SMOKED = "1" if line[172:173] == "1" or line[172:173] == "2" else "5"
+            # 1.  Yes
+            # 5.  No [Inap in V507]
+            DRINKED = line[176:177]
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = np.nan
+            # 体重
+            WEIGH= float(line[179:182])*0.45359237
+            # 身高
+            HEIGHT = float(line[182:184])*0.0254
+            # BMI
+            BMI = WEIGH / math.pow(HEIGHT,2)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[139:140]
+            # STROKE 
+            STROKE = line[142:143]
             HHID_list.append(HHID)
             PN_list.append(PN)
-            BORN_YEAR_list.append(BORN_YEAR)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
             SEX_list.append(SEX)
-            print(line, end='')  # end='' 用来避免多余的换行
-    data = {
-        "HHID":HHID_list,
-        "PN":PN_list,
-        "BORN_YEAR":BORN_YEAR_list,
-        "SEX":SEX_list
-    }
-    result = pd.DataFrame(data)
-    print(result.info())
-    #进行去重处理
-    result.drop_duplicates(subset=["HHID","PN"], keep="last", inplace=True)
-    print(result.info())
-    result.to_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8", index=False)
-
-if __name__ == "__main__":
-    # result_born_sex = pd.read_csv("/root/r_base/HRS/result_born_sex.csv", encoding="utf-8")
-    #2006数据
-    year = "20"
-    wave = "06"
-    # 获取06年之前加入人员的信息
-    with(open("/root/r_base/HRS/2006/data/H06PR_R.da", "r", encoding="utf-8") )as file:
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        data["WAVE"] = 1993
+        result_1993 = pd.DataFrame(data)
+        result = pd.concat([result, result_1993], axis=0)
+    # 获取1994数据
+    with(open("/root/r_base/HRS/1994/data/W2a.da", "r", encoding="utf-8") )as file:
         HHID_list = []
         PN_list = []
-        BORN_YEAR_list = []
+        BIRTH_YEAR_list = []
         SEX_list = []
         MARITAL_STATUS_list = []
         EDUCATION_list = []
@@ -61,18 +224,22 @@ if __name__ == "__main__":
         for line in file:
             HHID = line[0:6]
             PN = line[6:9]
-            BORN_YEAR = line[25:29]
+            BIRTH_YEAR = line[26:30]
             # 1.Male 
             # 2.Female
-            SEX = line[20:21]   
-            # 0.  UNKNOWN
-            # 1.  MARRIED
-            # 2.  MARRIED SP ABSENT (IN INSTITUTION)
-            # 3.  MARRIED SP ABSENT (NOT IN INSTITUTION)
-            # 4.  DIVORCED/SEPARATED
-            # 5.  WIDOWED
-            # 6.  NEVER MARRIED
-            MARITAL_STATUS = line[106:107]
+            SEX = line[22:23]
+            # 1.      Married [Inap in V228-V238]
+            # 2.      Partner [Inap in V226-V227]
+            # 3.      Separated [Inap in V226-V234]
+            # 4.      Divorced [Inap in V226-V234]
+            # 5.      Widowed [Inap in V226-V234]
+            # 6.      Never married
+            # 7.      Married (Not Institutionalized/not out of country)
+            # 8.      Married (Institutionalized/out of country)
+            # 9.      NA
+            MARITAL_STATUS = line[55:57]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="7" or MARITAL_STATUS=="8" else "5" if not MARITAL_STATUS=="9" else np.nan
             # 0 For no formal education 
             # 1-11 .....Grades 
             # 12 .......High school 
@@ -80,51 +247,2543 @@ if __name__ == "__main__":
             # 16 .......College grad
             # 17 .......Post college (17+ years)
             # 97 .......Other
-            EDUCATION = line[585:587]
+            # 98.     Don't Know; DK
+            # 99.     Not Ascertained; NA
+            EDUCATION = line[112:115]
             HHID_list.append(HHID)
             PN_list.append(PN)
-            BORN_YEAR_list.append(BORN_YEAR)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
             SEX_list.append(SEX)
             MARITAL_STATUS_list.append(MARITAL_STATUS)
             EDUCATION_list.append(EDUCATION)
-            print(line, end='')  # end='' 用来避免多余的换行
         data = {
             "HHID":HHID_list,
             "PN":PN_list,
-            "BORN_YEAR":BORN_YEAR_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
             "SEX":SEX_list,
             "MARITAL_STATUS":MARITAL_STATUS_list,
-            "EDUCATION":EDUCATION_list
+            "EDUCATION":EDUCATION_list,
         }
-        data["WAVE"] = 2006
-        result = pd.DataFrame(data)
-    # 将06年新加入的人员合并入数据
-    with(open("/root/r_base/HRS/2006/data/H06C_R.da", "r", encoding="utf-8") )as file:
+        data["WAVE"] = 1994
+        result_1994_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/1994/data/W2B.DA", "r", encoding="utf-8") )as file:
         HHID_list = []
         PN_list = []
-        DRINK_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]        
         # 逐行读取文件
         for line in file:
             HHID = line[0:6]
             PN = line[6:9]
-            #饮酒
-            #      1.  YES
-            #      3.  [VOL] NEVER HAVE USED ALCOHOL
-            #      5.  NO
-            #      8.  DK (Don't Know); NA (Not Ascertained)
-            #      9.  RF (Refused)
-            #  Blank.  INAP (Inapplicable); Partial Interview
-            DRINK = line[207:208]
-
+            #1.      Yes
+            #5.      No [GO TO B41]
+            #8.      Don't Know; DK [GO TO B41]
+            #9.      Refused; RF [GO TO B41]
+            #0.      Inap.
+            #        Proxy interview for deceased Wave-1 R
+            SMOKED = line[356:358]
+            #1.      Yes
+            #5.      No [GO TO B41]
+            #8.      Don't Know; DK [GO TO B41]
+            #9.      Refused; RF [GO TO B41]
+            #0.      Inap.
+            #        Proxy interview for deceased Wave-1 R
+            DRINKED = line[367:369]
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 活动单位
+            # 02.     Week
+            # 04.     Month
+            # 06.     Year
+            # 07.     Other (specify)
+            # 11.     Day
+            # 98.     Don't Know/Not Ascertained; DK/NA
+            # 99.     Refused; RF
+            # 00.     Inap.
+            #         Proxy interview for deceased Wave-1 R
+            #         [B42a: or B42=995-999]
+            #         [B43a: or B43=995-999]
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[378:382]
+            VIGOROUS_UNIT = line[382:385]
+            VIGOROUS_PHYSICAL_FLAG = np.nan
+            if VIGOROUS_UNIT == "02" and float(VIGOROUS_PHYSICAL)>0 and not VIGOROUS_PHYSICAL=="994" and not float(VIGOROUS_PHYSICAL)>995:
+                VIGOROUS_PHYSICAL_FLAG = True
+            if VIGOROUS_UNIT == "04" and float(VIGOROUS_PHYSICAL)>3 and not VIGOROUS_PHYSICAL=="994" and not float(VIGOROUS_PHYSICAL)>995:
+                VIGOROUS_PHYSICAL_FLAG = True
+            if VIGOROUS_UNIT == "06" and float(VIGOROUS_PHYSICAL)>51 and not VIGOROUS_PHYSICAL=="994" and not float(VIGOROUS_PHYSICAL)>995:
+                VIGOROUS_PHYSICAL_FLAG = True
+            if VIGOROUS_PHYSICAL=="00":
+                VIGOROUS_PHYSICAL_FLAG = False
+            # 轻度活动
+            LIGHT_PHYSICAL = line[371:375]
+            LIGHT_UNIT = line[375:378]
+            # 判断是否符合轻运动  1符合;0不符合
+            LIGHT_PHYSICAL_FLAG = np.nan
+            if LIGHT_UNIT == "02" and float(LIGHT_PHYSICAL)>0 and not LIGHT_PHYSICAL=="994" and not float(LIGHT_PHYSICAL)>995:
+                LIGHT_PHYSICAL_FLAG = True
+            if LIGHT_UNIT == "04" and float(LIGHT_PHYSICAL)>3 and not LIGHT_PHYSICAL=="994" and not float(LIGHT_PHYSICAL)>995:
+                LIGHT_PHYSICAL_FLAG = True
+            if LIGHT_UNIT == "06" and float(LIGHT_PHYSICAL)>51 and not LIGHT_PHYSICAL=="994" and not float(LIGHT_PHYSICAL)>995:
+                LIGHT_PHYSICAL_FLAG = True
+            if LIGHT_PHYSICAL=="00":
+                LIGHT_PHYSICAL_FLAG = False
+            PHYSICAL_ACTIVITY_LEVEL = 3 if VIGOROUS_PHYSICAL_FLAG == True else 2 if LIGHT_PHYSICAL_FLAG==True else 1 if LIGHT_PHYSICAL_FLAG==False or VIGOROUS_PHYSICAL_FLAG==False else np.nan
+            # 体重
+            WEIGH= float(line[385:389])*0.45359237 if not float(line[385:389])>500 else np.nan
+            # 身高
+            HIGHT = float(line[389:392])*0.3048 + float(line[392:395])*0.0254 if not float(line[389:392])>95 and not float(line[392:395])>95 else np.nan
+            # BMI
+            BMI = WEIGH / math.pow(HIGHT,2) if not np.isnan(WEIGH) and not np.isnan(HIGHT) else np.nan
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[147:149] if not line[147:149]=="8" and not line[147:149]=="9" and not line[147:149]=="0" else np.nan
+            # STROKE 
+            STROKE = line[173:175] if not line[173:175]=="8" and not line[173:175]=="9" and not line[173:175]=="0" else np.nan
             HHID_list.append(HHID)
             PN_list.append(PN)
-            DRINK_list.append(DRINK)
-            print(line, end='')  # end='' 用来避免多余的换行
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
         data = {
             "HHID":HHID_list,
             "PN":PN_list,
-            "EDUCATION":EDUCATION_list
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
         }
-        result = pd.DataFrame(data)
-        print(result.info())
+        result_1994_two = pd.DataFrame(data)
+        result_1994 = pd.merge(result_1994_one, result_1994_two, on=["HHID", "PN"], how="left")
+        result = pd.concat([result, result_1994], axis=0)
+    print(result.head())
+    # 获取1995数据
+    with(open("/root/r_base/HRS/1995/data/A95A_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[30:34]
+            # MARRIED, SPOUSE PRESENT...........  1  
+            # MARRIED, SPOUSE ABSENT............  2  
+            # LIVING WITH SOMEONE...............  3 GO TO A11b 
+            # DIVORCED/SEPARATED................  4 GO TO A11g 
+            # WIDOWED...........................  5 GO TO A11g 
+            # NEVER MARRIED.....................  6 GO TO B1 
+            # 0. Exit proxy was taken before the interview with the surviving spouse.
+            # 7. MARRIED, SPOUSE ABSENT (NOT INSTITUTION)
+            MARITAL_STATUS = line[76:77]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="3" else np.nan if MARITAL_STATUS=="0" or MARITAL_STATUS=="7" else "5"
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = np.nan if line[49:51] == "97" or line[49:51] == "98" or line[49:51] == "99" else line[49:51]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list,
+        }
+        data["WAVE"] = 1995
+        result_1995_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/1995/data/A95CS_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1.Male 
+            # 2.Female
+            SEX = line[36:37] if not line[36:37]=="0" else np.nan
+        HHID_list.append(HHID)
+        PN_list.append(PN)
+        SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SEX":SEX_list,
+        }
+        result_1995_two = pd.DataFrame(data)
+        result_1995 = pd.merge(result_1995_one, result_1995_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/1995/data/A95B_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 2. Cigars
+            # 5. NO
+            # 7. Other
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            SMOKED = "1" if line[153:154] == "1" or line[153:154] == "2" else "5" if line[153:154] == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 7. Other
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = "1" if line[157:158] == "1"  else "5" if line[157:158] == "5" or line[157:158] == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = np.nan
+            # 体重
+            WEIGH= float(line[164:167])*0.45359237 if not float(line[164:167])>400 else np.nan
+            # 身高
+            HIGHT = line[168:169]
+            if not line[168:169] == " " and not line[168:169] == "8" and not line[168:169] == "9" and not pd.isna(WEIGH):
+                HIGHT = float(line[168:169])*0.3048 + float(line[169:171])*0.0254
+                # BMI
+                BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. [VOL] DISPUTES W1 RECORD
+            # 5. NO
+            # 7. Other
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
 
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   3. [VOL] DISPUTES W1 RECORD
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = np.nan if line[63:64] == "8" else line[63:64]
+            # STROKE 
+            #   1.      Yes
+            #   3. [VOL] DISPUTES W1 RECORD
+            #   5.      No [Inap in V418-V421]
+            STROKE = np.nan if line[84:85] == "8" else "5" if line[84:85] == "2" or line[84:85] == "5" else line[84:85]
+            HHID_list.append(HHID)
+            PN_list.append(PN) 
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        result_1995_three = pd.DataFrame(data)
+        result_1995 = pd.merge(result_1995, result_1995_three, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_1995], axis=0)
+    # 获取1996数据
+    with(open("/root/r_base/HRS/1996/data/H96A_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29]
+            # MARRIED, SPOUSE PRESENT...........  1  
+            # MARRIED, SPOUSE ABSENT............  2  
+            # LIVING WITH SOMEONE...............  3 GO TO A11b 
+            # DIVORCED/SEPARATED................  4 GO TO A11g 
+            # WIDOWED...........................  5 GO TO A11g 
+            # NEVER MARRIED.....................  6 GO TO B1 
+            # 7. MARRIED, SPOUSE ABSENT (NOT INSTITUTION)
+            MARITAL_STATUS = line[70:71]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" or MARITAL_STATUS=="2" or MARITAL_STATUS=="3" else "5" if MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = np.nan if line[43:45] == "97" or line[43:45] == "98" or line[43:45] == "99" else line[43:45]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list,
+        }
+        data["WAVE"] = 1996
+        result_1996_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/1996/data/H96CS_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1.Male 
+            # 2.Female
+            SEX = line[74:75]
+        HHID_list.append(HHID)
+        PN_list.append(PN)
+        SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SEX":SEX_list,
+        }
+        result_1996_two = pd.DataFrame(data)
+        result_1996 = pd.merge(result_1996_one, result_1996_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/1996/data/H96B_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 2. Cigars
+            # 3. PIPE (IF VOLUNTEERED)
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            SMOKED = "1" if line[160:161] == "1" or line[160:161] == "2" or line[160:161] == "3" else "5" if line[160:161] == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = "1" if line[166:167] == "1"  else "5" if line[166:167] == "5" or line[166:167] == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = np.nan
+            # 体重
+            WEIGH= float(line[174:177])*0.45359237 if not line[174:177].strip() =="" and not float(line[174:177])>400 else np.nan
+            # 身高
+            HIGHT = line[178:180].strip()
+            if not HIGHT == "" and not HIGHT == "98" and not HIGHT == "99" and not pd.isna(WEIGH):
+                HIGHT = float(HIGHT)*0.3048 + float(line[180:182])*0.0254
+                # BMI
+                BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. [VOL] DISPUTES W1 RECORD
+            # 5. NO
+            # 7. Other
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   3. [VOL] DISPUTES W1 RECORD
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = np.nan if line[66:67] == "9" or  line[66:67] == "" else line[66:67]
+            # STROKE 
+            #   1.      Yes
+            #   3. [VOL] DISPUTES W1 RECORD
+            #   5.      No [Inap in V418-V421]
+            STROKE = np.nan if line[87:88] == "8" else "5" if line[87:88] == "2" or line[87:88] == "5" else line[87:88]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        result_1996_three = pd.DataFrame(data)
+        result_1996 = pd.merge(result_1996, result_1996_three, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_1996], axis=0)    
+    # 获取1998数据
+    with(open("/root/r_base/HRS/1998/data/H98A_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[31:35] if not line[31:35] == "9998" else np.nan
+            # 0. DK (don't know); NA (not ascertained); RF (refused)
+            # 1. MARRIED
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            MARITAL_STATUS = line[150:151]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = np.nan if line[61:62] == "97" or line[61:62] == "98" or line[61:62] == "99" else line[61:62]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list,
+        }
+        data["WAVE"] = 1998
+        result_1998_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/1998/data/H98CS_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1.Male 
+            # 2.Female
+            SEX = line[70:71]
+        HHID_list.append(HHID)
+        PN_list.append(PN)    
+        SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SEX":SEX_list,
+        }
+        result_1998_two = pd.DataFrame(data)
+        result_1998 = pd.merge(result_1998_one, result_1998_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/1998/data/H98B_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[157:158] if line[157:158] == "1" or line[157:158] == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = "1" if line[184:185] == "1"  else "5" if line[184:185] == "5" or line[184:185] == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = np.nan
+            # 体重
+            WEIGH= float(line[196:199])*0.45359237 if not float(line[196:199])>400 else np.nan
+            # 身高
+            HEIGHT = line[200:202].strip()
+            if not HEIGHT == "" and not HEIGHT == "98" and not HEIGHT == "99" and not pd.isna(WEIGH):
+                if not line[202:204] == "98":
+                    HEIGHT = float(HEIGHT)*0.3048 + float(line[202:204])*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(line[200:202])*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. [VOL] DISPUTES PREVIOUS WAVE RECORD
+            # 5. NO
+            # 6. PRELOAD ERROR: Condition reported at prior wave but said no to
+            #    new event
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   3. [VOL] DISPUTES W1 RECORD
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = np.nan if line[66:67] == "8" else line[66:67]
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. [VOL] DISPUTES PREVIOUS WAVE RECORD
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   3. [VOL] DISPUTES W1 RECORD
+            #   5.      No [Inap in V418-V421]
+            STROKE = np.nan if line[87:88] == "8" else "5" if line[87:88] == "2" or line[87:88] == "5" else line[87:88]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        result_1998_three = pd.DataFrame(data)
+        result_1998 = pd.merge(result_1998, result_1998_three, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_1998], axis=0)       
+    # 获取2000数据
+    with(open("/root/r_base/HRS/2000/data/H00A_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[31:35]
+            # 0. DK (don't know); NA (not ascertained); RF (refused)
+            # 1. MARRIED
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            MARITAL_STATUS = line[152:153]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = np.nan if line[63:65] == "97" or line[63:65] == "98" or line[63:65] == "99" else line[63:65]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list,
+        }
+        data["WAVE"] = 2000
+        result_2000_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2000/data/H00CS_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1.Male 
+            # 2.Female
+            SEX = line[79:80]
+        HHID_list.append(HHID)
+        PN_list.append(PN)
+        SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SEX":SEX_list,
+        }
+        result_2000_two = pd.DataFrame(data)
+        result_2000 = pd.merge(result_2000_one, result_2000_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2000/data/H00B_R.DA", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[154:155] if line[154:155] == "1" or line[154:155] == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = "1" if line[181:182] == "1"  else "5" if line[181:182] == "5" or line[181:182] == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = np.nan
+            # 体重
+            WEIGH= float(line[193:196])*0.45359237 if not float(line[193:196])>400 else np.nan
+            # 身高
+            HEIGHT = line[197:198].strip()
+            if not HEIGHT == "" and not HEIGHT == "9" and not pd.isna(WEIGH):
+                HEIGHT = float(HEIGHT)*0.3048 + float(line[198:200])*0.0254
+                # BMI
+                BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = np.nan if line[65:66] == "8" or line[65:66] == "9" else "1" if line[65:66] == "1" or line[65:66] == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = np.nan if line[86:87] == "8" or line[86:87] == "9" else "5" if line[86:87] == "2" or line[86:87] == "4" or line[86:87] == "5" else "1"
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        result_2000_three = pd.DataFrame(data)
+        result_2000 = pd.merge(result_2000, result_2000_three, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2000], axis=0)      
+    # 获取2002数据
+    with(open("/root/r_base/HRS/2002/data/H02PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[36:40]
+            # 1.Male 
+            # 2.Female
+            SEX = line[19:20]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2002
+        result_2002_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2002/data/H02C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[155:156] if line[155:156] == "1" or line[155:156] == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = "1" if line[179:180] == "1"  else "5" if line[179:180] == "5" or line[179:180] == "3" else np.nan
+            # 体重
+            WEIGH= float(line[190:193])*0.45359237 if not float(line[190:193])>400 else np.nan
+            # 身高
+            HEIGHT = line[194:195].strip()
+            if not HEIGHT == "" and not HEIGHT == "8" and not pd.isna(WEIGH):
+                if not line[195:197] == "98" and not line[195:197]=="":
+                    HEIGHT = float(HEIGHT)*0.3048 + float(line[195:197])*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(line[194:195])*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = np.nan if line[59:60] == "8" or line[59:60] == "9" else "1" if line[59:60] == "1" or line[59:60] == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = np.nan if line[80:81] == "8" or line[80:81] == "9" else "5" if line[80:81] == "2" or line[80:81] == "4" or line[80:81] == "5" else "1"
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list
+        }
+        result_2002_two = pd.DataFrame(data)
+        result_2002 = pd.merge(result_2002_one, result_2002_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2002/data/H02V_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = "3" if line[151:152] == "1" or line[151:152] == "2" else "1" if line[151:152] == "3" or line[151:152] == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = "2" if line[152:153] == "1" or line[152:153] == "2" else "1" if line[152:153] == "3" or line[152:153] == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2002_three = pd.DataFrame(data)
+        result_2002 = pd.merge(result_2002, result_2002_three, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2002/data/H02B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. Married
+            # 3. Separated
+            # 4. Divorced
+            # 5. Widowed
+            # 6. Never Married
+            # 7. Other (Specify)
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            MARITAL_STATUS = line[131:132]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = np.nan if line[42:44] == "97" or line[42:44] == "98" or line[42:44] == "99" else line[42:44]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2002_four = pd.DataFrame(data)
+        result_2002 = pd.merge(result_2002, result_2002_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2002], axis=0)          
+    # 获取2004数据
+    with(open("/root/r_base/HRS/2004/data/H04PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29]
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2004
+        result_2004_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2004/data/H04C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[167:168] if line[167:168] == "1" or line[167:168] == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = "1" if line[192:193] == "1"  else "5" if line[192:193] == "5" or line[192:193] == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH= float(line[203:206])*0.45359237 if not float(line[203:206])>400 else np.nan
+            # 身高
+            HEIGHT = line[222:223].strip()
+            if not HEIGHT == "" and not HEIGHT == "8" and not pd.isna(WEIGH):
+                if not line[223:225] == "98" and not line[223:225]=="":
+                    HEIGHT = float(HEIGHT)*0.3048 + float(line[223:225])*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(line[222:223])*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = np.nan if line[69:70] == "8" or line[69:70] == "9" else "1" if line[69:70] == "1" or line[69:70] == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = np.nan if line[90:91] == "8" or line[90:91] == "9" else "5" if line[90:91] == "2" or line[90:91] == "4" or line[90:91] == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = "3" if line[164:165] == "1" or line[164:165] == "2" or line[164:165] == "7" else "1" if line[164:165] == "3" or line[164:165] == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = "2" if line[165:166] == "1" or line[165:166] == "2" or line[165:166] == "7" else "1" if line[165:166] == "3" or line[165:166] == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2004_two = pd.DataFrame(data)
+        result_2004 = pd.merge(result_2004_one, result_2004_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2004/data/H04B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            MARITAL_STATUS = line[161:162]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = np.nan if line[41:43] == "97" or line[41:43] == "98" or line[41:43] == "99" else line[41:43]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2004_four = pd.DataFrame(data)
+        result_2004 = pd.merge(result_2004, result_2004_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2004], axis=0)    
+    # 获取2006数据
+    with(open("/root/r_base/HRS/2006/data/H06PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29]
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2006
+        result_2006_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2006/data/H06C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[181:182]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[207:208]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[218:221].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
+            # 身高
+            HEIGHT_FEET = line[252:253]
+            HEIGHT_INCHES = line[253:255]
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[80:81]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[101:102]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[178:179]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[179:180]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2006_two = pd.DataFrame(data)
+        result_2006 = pd.merge(result_2006_one, result_2006_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2006/data/H06B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[177:178]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[47:49]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2006_four = pd.DataFrame(data)
+        result_2006 = pd.merge(result_2006, result_2006_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2006], axis=0) 
+    # 获取2008数据
+    with(open("/root/r_base/HRS/2008/data/H08PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29]
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2008
+        result_2008_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2008/data/H08C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[258:259]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[284:285]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[295:298]
+            WEIGH= float(WEIGH)*0.45359237 if not float(WEIGH)>400 else np.nan
+            # 身高
+            HEIGHT_FEET = line[329:330].strip()
+            HEIGHT_INCHES = line[330:338].strip()
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "99998" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[128:129]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[149:150]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[255:256]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[256:257]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2008_two = pd.DataFrame(data)
+        result_2008 = pd.merge(result_2008_one, result_2008_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2008/data/H08B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[311:312]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[48:50]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2008_four = pd.DataFrame(data)
+        result_2008 = pd.merge(result_2008, result_2008_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2008], axis=0) 
+    # 获取2010数据
+    with(open("/root/r_base/HRS/2010/data/H10PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29]
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21]
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2010
+        result_2010_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2010/data/H10C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[250:251]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[276:277]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[287:290].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
+            # 身高
+            HEIGHT_FEET = line[304:305].strip()
+            HEIGHT_INCHES = line[305:313]
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "99998" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[76:77]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[139:140]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[247:248]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[248:249]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2010_two = pd.DataFrame(data)
+        result_2010 = pd.merge(result_2010_one, result_2010_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2010/data/H10B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[305:306]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[48:50]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2010_four = pd.DataFrame(data)
+        result_2010 = pd.merge(result_2010, result_2010_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2010], axis=0) 
+    # 获取2012数据
+    with(open("/root/r_base/HRS/2012/data/H12PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21] if not line[20:21] == "" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2012
+        result_2012_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2012/data/H12C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[249:250]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[276:277]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[287:290].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
+            # 身高
+            HEIGHT_FEET = line[302:303].strip()
+            HEIGHT_INCHES = line[303:308]
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[82:83]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[146:147]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[246:247]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[247:248]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2012_two = pd.DataFrame(data)
+        result_2012 = pd.merge(result_2012_one, result_2012_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2012/data/H12B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[294:295]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[48:50]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2012_four = pd.DataFrame(data)
+        result_2012 = pd.merge(result_2012, result_2012_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2012], axis=0) 
+    # 获取2014数据
+    with(open("/root/r_base/HRS/2014/data/H14PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21] if not line[20:21] == "" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2014
+        result_2014_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2014/data/H14C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[214:215]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[239:240]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[250:253].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
+            # 身高
+            HEIGHT_FEET = line[259:260].strip()
+            HEIGHT_INCHES = line[260:265]
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[66:67]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[122:123]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[211:212]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[212:213]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2014_two = pd.DataFrame(data)
+        result_2014 = pd.merge(result_2014_one, result_2014_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2014/data/H14B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[274:275]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[39:41]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2014_four = pd.DataFrame(data)
+        result_2014 = pd.merge(result_2014, result_2014_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2014], axis=0) 
+    # 获取2016数据
+    with(open("/root/r_base/HRS/2016/data/H16PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21] if not line[20:21] == "" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2016
+        result_2016_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2016/data/H16C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[237:238]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[262:263]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[273:276].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan
+            # 身高
+            HEIGHT_FEET = line[282:283].strip()
+            HEIGHT_INCHES = line[283:288]
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # 1. YES
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[67:68]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5"
+            # 1. YES
+            # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)
+            # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION
+            # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[123:124]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[234:235]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[235:236]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2016_two = pd.DataFrame(data)
+        result_2016 = pd.merge(result_2016_one, result_2016_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2016/data/H16B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[275:276]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[39:41]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2016_four = pd.DataFrame(data)
+        result_2016 = pd.merge(result_2016, result_2016_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2016], axis=0) 
+    # 获取2018数据
+    with(open("/root/r_base/HRS/2018/data/H18PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan
+            # 1.Male 
+            # 2.Female
+            SEX = line[20:21] if not line[20:21] == "" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2018
+        result_2018_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2018/data/H18C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[381:382]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[404:406]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[418:421].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 and not float(WEIGH)<0 else np.nan
+            # 身高
+            HEIGHT_FEET = line[428:430].strip()
+            HEIGHT_INCHES = line[430:435].strip()
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES == "99" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # -8.  Web non-response
+            #  1.  YES
+            #  4.  [NEVER HAD HEART PROBLEM]
+            #  5.  NO
+            #  6.  [HAD HEART PROBLEM BEFORE, BUT DO NOT HAVE IT NOW AND AM NOT
+            #      TAKING MEDICATION FOR IT]
+            #  8.  DK (Don't Know); NA (Not Ascertained)
+            #  9.  RF (Refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[86:88]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" or HEART_PROBL == "-8" else "1" if HEART_PROBL == "1" else "5"
+            # -8.  Web non-response
+            #  1.  YES
+            #  2.  [[VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC
+            #      ATTACK)/POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)]
+            #  4.  [NEVER HAD A STROKE]
+            #  5.  NO
+            #  8.  DK (Don't Know); NA (Not Ascertained)
+            #  9.  RF (Refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[162:164]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" or STROKE == "-8" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[367:369]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[369:371]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2018_two = pd.DataFrame(data)
+        result_2018 = pd.merge(result_2018_one, result_2018_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2018/data/H18B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[287:288]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[41:43]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2018_four = pd.DataFrame(data)
+        result_2018 = pd.merge(result_2018, result_2018_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2018], axis=0) 
+    # 获取2020数据
+    with(open("/root/r_base/HRS/2020/data/H20PR_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        BIRTH_YEAR_list = []
+        SEX_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            BIRTH_YEAR = line[36:40] if not line[36:40] == "" else np.nan
+            # 1.Male 
+            # 2.Female
+            SEX = line[33:34] if not line[33:34] == "" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            BIRTH_YEAR_list.append(BIRTH_YEAR)
+            SEX_list.append(SEX)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "BIRTH_YEAR":BIRTH_YEAR_list,
+            "SEX":SEX_list,
+        }
+        data["WAVE"] = 2020
+        result_2020_one = pd.DataFrame(data)
+    with(open("/root/r_base/HRS/2020/data/H20C_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        SMOKED_list = []
+        DRINKED_list = []
+        BMI_list = []
+        HEART_PROBL_list = []
+        PHYSICAL_ACTIVITY_LEVEL_list = []
+        STROKE_list=[]    
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. YES
+            # 5. NO
+            SMOKED = line[339:340]
+            SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan
+            # 1. YES
+            # 3. [VOL] NEVER HAVE USED ALCOHOL
+            # 5. NO
+            # 8. DK (don't know); NA (not ascertained)
+            # 9. RF (refused)
+            # 合并后
+            # 1.  Yes
+            # 5.  No [Inap in V502-V505]
+            DRINKED = line[363:365]
+            DRINKED = "1" if DRINKED == "1"  else "5" if DRINKED == "5" or DRINKED == "3" else np.nan
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 体重
+            WEIGH = line[380:383].strip()
+            WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 and not float(WEIGH)<0 else np.nan
+            # 身高
+            HEIGHT_FEET = line[389:390].strip()
+            HEIGHT_INCHES = line[390:395].strip()
+            if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH):
+                if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="":
+                    HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+                else:
+                    HEIGHT = float(HEIGHT_FEET)*0.3048
+                    # BMI
+                    BMI = WEIGH / math.pow(HEIGHT,2)
+            else :
+                BMI = np.nan
+            # -8.  Web non-response
+            #  1.  YES
+            #  4.  [NEVER HAD HEART PROBLEM]
+            #  5.  NO
+            #  6.  [HAD HEART PROBLEM BEFORE, BUT DO NOT HAVE IT NOW AND AM NOT
+            #      TAKING MEDICATION FOR IT]
+            #  8.  DK (Don't Know); NA (Not Ascertained)
+            #  9.  RF (Refused)
+            # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            HEART_PROBL = line[73:75]
+            HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" or HEART_PROBL == "-8" else "1" if HEART_PROBL == "1" else "5"
+            # -8.  Web non-response
+            #  1.  YES
+            #  2.  [[VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC
+            #      ATTACK)/POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)]
+            #  4.  [NEVER HAD A STROKE]
+            #  5.  NO
+            #  8.  DK (Don't Know); NA (Not Ascertained)
+            #  9.  RF (Refused)
+            # STROKE 
+            #   1.      Yes
+            #   5.      No [Inap in V418-V421]
+            STROKE = line[138:140]
+            STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" or STROKE == "-8" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1"
+            # 1. MORE THAN ONCE A WEEK
+            # 2. ONCE A WEEK
+            # 3. ONCE TO THREE TIMES A MONTH
+            # 4. HARDLY EVER OR NEVER
+            # 7. (VOL) EVERY DAY
+            # 8. DK (Don't Know)
+            # 9. RF (Refused)
+            # 3. vigorous (vigorous activity more than once a week)
+            # 2. moderate (moderate activity more than once a week)
+            # 1. inactive (the rest)
+            # 重度活动
+            VIGOROUS_PHYSICAL = line[325:327]
+            VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan
+            #中度活动
+            MODERATE_PHYSICAL = line[327:329]
+            MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan
+            # 轻度活动
+            LIGHT_PHYSICAL = np.nan
+            PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL)            
+            SMOKED_list.append(SMOKED)
+            DRINKED_list.append(DRINKED)
+            BMI_list.append(BMI)
+            HEART_PROBL_list.append(HEART_PROBL)
+            STROKE_list.append(STROKE)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "SMOKED":SMOKED_list,
+            "DRINKED":DRINKED_list,
+            "BMI":BMI_list,
+            "HEART_PROBL":HEART_PROBL_list,
+            "STROKE":STROKE_list,
+            "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list
+        }
+        result_2020_two = pd.DataFrame(data)
+        result_2020 = pd.merge(result_2020_one, result_2020_two, on=["HHID", "PN"], how="left")
+    with(open("/root/r_base/HRS/2020/data/H20B_R.da", "r", encoding="utf-8") )as file:
+        HHID_list = []
+        PN_list = []
+        MARITAL_STATUS_list = []
+        EDUCATION_list = []
+        # 逐行读取文件
+        for line in file:
+            HHID = line[0:6]
+            PN = line[6:9]
+            # 1. MARRIED (VOL)
+            # 2. ANULLED (VOL)
+            # 3. SEPARATED
+            # 4. DIVORCED
+            # 5. WIDOWED
+            # 6. NEVER MARRIED
+            # 7. OTHER (SPECIFY)
+            # 8.  DK (Don't Know); NA (Not Ascertained)
+            # 9.  RF (Refused)
+            MARITAL_STATUS = line[304:305]
+            # 1 Married or Partner; 5 other
+            MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan
+            # 0 For no formal education 
+            # 1-11 .....Grades 
+            # 12 .......High school 
+            # 13-15 ....Some college
+            # 16 .......College grad
+            # 17 .......Post college     (17+ years)
+            # 97 .......Other
+            # 98. DK (don't know); NA (not ascertained)
+            # 99. RF (refused)
+            EDUCATION = line[40:42]
+            EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION
+            HHID_list.append(HHID)
+            PN_list.append(PN)
+            MARITAL_STATUS_list.append(MARITAL_STATUS)
+            EDUCATION_list.append(EDUCATION)
+        data = {
+            "HHID":HHID_list,
+            "PN":PN_list,
+            "MARITAL_STATUS":MARITAL_STATUS_list,
+            "EDUCATION":EDUCATION_list
+        }
+        result_2020_four = pd.DataFrame(data)
+        result_2020 = pd.merge(result_2020, result_2020_four, on=["HHID", "PN"], how="left")
+    result = pd.concat([result, result_2020], axis=0) 
+    result.to_csv("/root/r_base/HRS/result_all.csv", index=False)

+ 3 - 0
MR相关工作.md → Medical相关工作.md

@@ -203,3 +203,6 @@ HyenaDNA: Long-Range Genomic Sequence Modeling at Single Nucleotide Resolution
    - I9_CHD	Major coronary heart disease event	IX Diseases of the circulatory system (I9_)	51098 cases	402635 controls
    - I9_CORATHER	Coronary atherosclerosis	IX Diseases of the circulatory system (I9_)	56685 cases 378019 controls
 5. 蛋白相关数据来自冰岛的35,559 Icelanders,地址:www.decode.com/summarydata/,相关论文"Large-scale integration of the plasma proteome with genetics and disease"
+
+# 工作 9.4
+1. HRS数据的样本分多次加入:1992、AHEAD 1993-94、1995-96、1998、2004

+ 77 - 0
test.py

@@ -0,0 +1,77 @@
+import pandas as pd  # used only by the commented-out ad-hoc checks below
+
+# data = pd.read_csv("CLHLS/clhls_1998_2018_result.csv")
+# print(data.shape)
+
+
+# data = pd.read_csv("HRS/result_all.csv")
+# print(data.shape)
+# # Drop duplicate (HHID, PN) rows and count the remaining IDs
+# unique_ids = data.drop_duplicates(subset=["HHID", "PN"])
+# count_unique_ids = unique_ids.count()
+# print(count_unique_ids)
+
+# data = pd.read_csv("/root/r_base/UKDA-5050-stata/result_all.csv")
+# print(data.shape)
+# # Drop duplicate id rows and count the remaining IDs
+# unique_ids = data.drop_duplicates(subset=["id"])
+# count_unique_ids = unique_ids.count()
+# print(count_unique_ids)
+
+# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
+# print(df.shape)
+# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_pensiongrid_eul_v2.dta', convert_categoricals=False)
+# print(df.shape)
+# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_financial_derived_variables.dta', convert_categoricals=False)
+# print(df.shape)
+# df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
+# print(df.shape)
+
+# Folder to scan
+# import glob
+# import os
+# folder_path = '/root/r_base/NHANES/2017-2018'
+
+# # Collect the paths of all .XPT files in the folder
+# xpt_files = glob.glob(os.path.join(folder_path, '*.XPT'))
+
+# num = 0
+# # Read each .XPT file and accumulate its column count
+# for file_path in xpt_files:
+#     try:
+#         # Read the .XPT file with pandas
+#         df = pd.read_sas(file_path, format='xport')
+#         # Print the DataFrame shape as a sanity check
+#         print(f"Data from {file_path}:")
+#         print(df.shape)
+#         num += df.shape[1]
+#     except Exception as e:
+#         print(f"Error reading {file_path}: {e}")
+# print(num)
+
+# data = pd.read_csv("/root/r_base/CHARLS/result_all.csv")
+# print(data.shape)
+# # Drop duplicate householdID rows and count the remaining IDs
+# unique_ids = data.drop_duplicates(subset=["householdID"])
+# count_unique_ids = unique_ids.count()
+# print(count_unique_ids)
+# Folder to scan
+# import glob
+# import os
+# folder_path = '/root/r_base/CHARLS/CHARLS2018'
+
+# # Collect the paths of all .dta files in the folder
+# xpt_files = glob.glob(os.path.join(folder_path, '*.dta'))
+# num = 0
+# # Read each .dta file and accumulate its column count
+# for file_path in xpt_files:
+#     try:
+#         # Read the .dta file with pandas
+#         df = pd.read_stata(file_path)
+#         # Print the DataFrame shape as a sanity check
+#         print(f"Data from {file_path}:")
+#         print(df.shape)
+#         num += df.shape[1]
+#     except Exception as e:
+#         print(f"Error reading {file_path}: {e}")
+# print(num)