import math

import numpy as np
import pandas as pd

# Covariates harmonised across the HRS waves handled by this script:
#   age, sex, marital status, education, smoking status, drinking status,
#   physical activity level, body mass index (BMI),
#   glycated haemoglobin (HbA1c), systolic blood pressure (SBP),
#   high-density lipoprotein cholesterol (HDL-C), C-reactive protein
#
# Harmonised codes written to the output frame:
#   MARITAL_STATUS   "1" married or partnered, "5" other
#   SMOKED / DRINKED "1" yes, "5" no
#   HEART_PROBL      "1" yes, "5" no (heart attack, coronary heart disease,
#                    angina, congestive heart failure or other heart problems)
#   STROKE           "1" yes, "5" no
#   PHYSICAL_ACTIVITY_LEVEL
#                    3 vigorous (vigorous activity more than once a week)
#                    2 moderate (moderate activity more than once a week)
#                    1 inactive (the rest)
#   EDUCATION        0 none, 1-11 grades, 12 high school, 13-15 some college,
#                    16 college grad, 17 post college, 97 other

LB_TO_KG = 0.45359237
FT_TO_M = 0.3048
IN_TO_M = 0.0254

KEY_COLUMNS = ["HHID", "PN"]
DEMOGRAPHIC_COLUMNS = KEY_COLUMNS + ["BIRTH_YEAR", "MARITAL_STATUS", "EDUCATION"]
SEX_COLUMNS = KEY_COLUMNS + ["SEX"]
HEALTH_COLUMNS = KEY_COLUMNS + [
    "SMOKED",
    "DRINKED",
    "PHYSICAL_ACTIVITY_LEVEL",
    "BMI",
    "HEART_PROBL",
    "STROKE",
]
FULL_COLUMNS = KEY_COLUMNS + [
    "BIRTH_YEAR",
    "SEX",
    "MARITAL_STATUS",
    "EDUCATION",
] + HEALTH_COLUMNS[2:]

# 1994 activity questions: unit code -> count that must be exceeded for the
# activity to happen more than once a week (2 = week, 4 = month, 6 = year).
ACTIVITY_MINIMUM = {2: 0, 4: 3, 6: 51}


def read_fixed_width(path, parse_line, columns, wave=None):
    """Parse every line of a fixed-width file into one DataFrame row.

    ``parse_line`` receives the raw line and returns a dict keyed by column
    name. ``columns`` fixes the column order (and keeps the columns present
    when the file is empty). When ``wave`` is given it is appended as a
    constant ``WAVE`` column.
    """
    with open(path, "r", encoding="utf-8") as handle:
        rows = [parse_line(line) for line in handle]
    frame = pd.DataFrame(rows, columns=columns)
    if wave is not None:
        frame["WAVE"] = wave
    return frame


def weight_kg(field, ceiling):
    """Convert a weight in pounds to kg; values above ``ceiling`` become NaN."""
    pounds = float(field)
    return np.nan if pounds > ceiling else pounds * LB_TO_KG


def bmi(weight, feet, inches=0.0):
    """Return weight (kg) divided by the squared height (feet + inches) in m."""
    height_m = float(feet) * FT_TO_M + float(inches) * IN_TO_M
    return weight / math.pow(height_m, 2)


def keep_codes(code, valid):
    """Return ``code`` when it is one of ``valid``, otherwise NaN."""
    return code if code in valid else np.nan


def drop_education_nonresponse(field):
    """Map the education codes 97 (other), 98 (DK/NA) and 99 (RF) to NaN."""
    return np.nan if field in ("97", "98", "99") else field


def marital_1992_codes(code):
    """Recode the 1992/1994 marital question.

    1 married, 2 partner, 7/8 married with two residences -> "1";
    9 (NA) -> NaN; everything else (separated, divorced, widowed,
    never married) -> "5".
    """
    if code in ("1", "2", "7", "8"):
        return "1"
    return np.nan if code == "9" else "5"


def marital_1998_codes(code):
    """Recode the 1998-2002 marital question.

    1 married -> "1"; 3 separated, 4 divorced, 5 widowed, 6 never married,
    7 other -> "5"; anything else (DK/NA/RF) -> NaN.
    """
    if code == "1":
        return "1"
    return "5" if code in ("3", "4", "5", "6", "7") else np.nan


def drinks_alcohol(code):
    """Recode 1 (yes) -> "1"; 5 (no) and 3 (never used alcohol) -> "5"."""
    if code == "1":
        return "1"
    return "5" if code in ("5", "3") else np.nan


def stroke_before_2000(code):
    """Recode the 1995-1998 stroke item.

    8 (DK/NA) -> NaN; 2 (possible stroke or TIA) and 5 (no) -> "5";
    every other code (1 yes, 3 disputes record, 9 refused, ...) is kept raw.
    """
    if code == "8":
        return np.nan
    return "5" if code in ("2", "5") else code


def heart_from_2000(code):
    """Recode the 2000+ heart item: 8/9 -> NaN; 1 or 3 -> "1"; else "5"."""
    if code in ("8", "9"):
        return np.nan
    return "1" if code in ("1", "3") else "5"


def stroke_from_2000(code):
    """Recode the 2000+ stroke item: 8/9 -> NaN; 2, 4 or 5 -> "5"; else "1"."""
    if code in ("8", "9"):
        return np.nan
    return "5" if code in ("2", "4", "5") else "1"


def activity_flag(count_field, unit_field):
    """Classify a 1994 activity frequency.

    Returns True when the activity happens more than once a week (more than
    0 per week, 3 per month or 51 per year), False when the count is zero,
    and NaN otherwise (special count codes 994-999, unhandled units, blanks).

    The original compared the unstripped fixed-width slices with shorter
    literals, so no branch could ever match; the fields are parsed
    numerically here.
    NOTE(review): assumes the fields are space- or zero-padded numbers -
    confirm against the 1994 codebook. Unit 11 (day) is not classified,
    as in the original.
    """
    unit = unit_field.strip()
    try:
        count = float(count_field)
    except ValueError:
        return np.nan
    if count == 0:
        return False
    if count >= 994:
        # The source comments list 994 and 995-999 as non-substantive codes.
        return np.nan
    minimum = ACTIVITY_MINIMUM.get(int(unit)) if unit.isdigit() else None
    if minimum is not None and count > minimum:
        return True
    return np.nan


def parse_1992_health(line):
    """Parse one record of the 1992 ``health.da`` file."""
    if line[534:535] in ("1", "2"):        # vigorous activity
        activity = 3
    elif line[533:534] in ("1", "2"):      # light activity
        activity = 2
    else:
        activity = 1
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[249:254],
        "SEX": line[109:110],              # 1 male, 2 female
        "MARITAL_STATUS": marital_1992_codes(line[302:303]),
        "EDUCATION": line[264:266],
        "SMOKED": line[519:520],
        "DRINKED": line[527:528],
        "PHYSICAL_ACTIVITY_LEVEL": activity,
        "BMI": bmi(float(line[536:539]) * LB_TO_KG, line[542:543], line[543:545]),
        "HEART_PROBL": line[459:460],
        "STROKE": line[473:474],
    }


def parse_1993_br21(line):
    """Parse one record of the 1993 ``BR21.DA`` file."""
    # 1 married spouse present, 2 married spouse absent, 3 living with someone
    marital = "1" if line[98:99] in ("1", "2", "3") else "5"
    # 1 current smoker, 2 former smoker, 3 never smoked
    smoked = "1" if line[172:173] in ("1", "2") else "5"
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[61:65],
        "SEX": line[16:17],
        "MARITAL_STATUS": marital,
        "EDUCATION": line[74:76],
        "SMOKED": smoked,
        "DRINKED": line[176:177],
        "PHYSICAL_ACTIVITY_LEVEL": np.nan,  # not derived for this wave
        # Height is recorded in inches only in this file.
        "BMI": bmi(float(line[179:182]) * LB_TO_KG, 0.0, line[182:184]),
        "HEART_PROBL": line[139:140],
        "STROKE": line[142:143],
    }


def parse_1994_w2a(line):
    """Parse one record of the 1994 ``W2a.da`` file (demographics)."""
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[26:30],
        "SEX": line[22:23],
        # The field is two characters wide; strip it so the one-character
        # codes can actually match (unstripped, every row fell through to "5").
        # NOTE(review): assumes space padding - confirm against the codebook.
        "MARITAL_STATUS": marital_1992_codes(line[55:57].strip()),
        "EDUCATION": line[112:115],
    }


def parse_1994_w2b(line):
    """Parse one record of the 1994 ``W2B.DA`` file (health)."""
    vigorous = activity_flag(line[378:382], line[382:385])
    light = activity_flag(line[371:375], line[375:378])
    if vigorous is True:
        activity = 3
    elif light is True:
        activity = 2
    elif light is False or vigorous is False:
        activity = 1
    else:
        activity = np.nan

    weight = weight_kg(line[385:389], 500)
    feet = float(line[389:392])
    inches = float(line[392:395])
    if feet > 95 or inches > 95 or pd.isna(weight):
        body_mass = np.nan
    else:
        body_mass = bmi(weight, feet, inches)

    # 1 yes, 5 no, 8 DK, 9 RF, 0 inap. The slices are two characters wide, so
    # they are stripped before being compared with the one-character codes.
    heart = line[147:149].strip()
    stroke = line[173:175].strip()
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SMOKED": line[356:358].strip(),
        "DRINKED": line[367:369].strip(),
        "PHYSICAL_ACTIVITY_LEVEL": activity,
        "BMI": body_mass,
        "HEART_PROBL": np.nan if heart in ("8", "9", "0") else heart,
        "STROKE": np.nan if stroke in ("8", "9", "0") else stroke,
    }


def parse_1995_demographics(line):
    """Parse one record of the 1995 ``A95A_R.DA`` file."""
    code = line[76:77]
    if code in ("1", "2", "3"):
        marital = "1"
    elif code in ("0", "7"):
        # 0 exit proxy taken before the spouse interview; 7 spouse absent
        marital = np.nan
    else:
        marital = "5"
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[30:34],
        "MARITAL_STATUS": marital,
        "EDUCATION": drop_education_nonresponse(line[49:51]),
    }


def parse_1995_sex(line):
    """Parse one record of the 1995 ``A95CS_R.DA`` file (0 -> NaN)."""
    sex = line[36:37]
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SEX": np.nan if sex == "0" else sex,
    }


def parse_sex(line, column):
    """Parse HHID, PN and the one-character sex code found at ``column``."""
    return {"HHID": line[0:6], "PN": line[6:9], "SEX": line[column:column + 1]}


def parse_1995_health(line):
    """Parse one record of the 1995 ``A95B_R.DA`` file."""
    smoke = line[153:154]                  # 1 yes, 2 cigars, 5 no
    if smoke in ("1", "2"):
        smoked = "1"
    else:
        smoked = "5" if smoke == "5" else np.nan

    weight = weight_kg(line[164:167], 400)
    feet = line[168:169]
    if feet not in (" ", "8", "9") and not pd.isna(weight):
        # Fix: the original divided by a stale ``HEIGHT`` left over from the
        # 1993 loop instead of the height it had just computed.
        body_mass = bmi(weight, feet, line[169:171])
    else:
        body_mass = np.nan

    heart = line[63:64]
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SMOKED": smoked,
        "DRINKED": drinks_alcohol(line[157:158]),
        "PHYSICAL_ACTIVITY_LEVEL": np.nan,
        "BMI": body_mass,
        "HEART_PROBL": np.nan if heart == "8" else heart,
        "STROKE": stroke_before_2000(line[84:85]),
    }


def parse_1996_demographics(line):
    """Parse one record of the 1996 ``H96A_R.DA`` file."""
    code = line[70:71]
    if code in ("1", "2", "3"):
        marital = "1"
    elif code in ("4", "5", "6", "7"):
        marital = "5"
    else:
        marital = np.nan
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[25:29],
        "MARITAL_STATUS": marital,
        "EDUCATION": drop_education_nonresponse(line[43:45]),
    }


def parse_1996_health(line):
    """Parse one record of the 1996 ``H96B_R.DA`` file."""
    smoke = line[160:161]                  # 1 yes, 2 cigars, 3 pipe, 5 no
    if smoke in ("1", "2", "3"):
        smoked = "1"
    else:
        smoked = "5" if smoke == "5" else np.nan

    pounds = line[174:177]
    weight = np.nan if pounds.strip() == "" else weight_kg(pounds, 400)
    feet = line[178:180].strip()
    if feet not in ("", "98", "99") and not pd.isna(weight):
        # Fix: use this row's height, not the stale ``HEIGHT`` from 1993.
        body_mass = bmi(weight, feet, line[180:182])
    else:
        body_mass = np.nan

    heart = line[66:67]
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SMOKED": smoked,
        "DRINKED": drinks_alcohol(line[166:167]),
        "PHYSICAL_ACTIVITY_LEVEL": np.nan,
        "BMI": body_mass,
        "HEART_PROBL": np.nan if heart in ("9", "") else heart,
        "STROKE": stroke_before_2000(line[87:88]),
    }


def parse_1998_demographics(line):
    """Parse one record of the 1998 ``H98A_R.DA`` file."""
    birth_year = line[31:35]
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": np.nan if birth_year == "9998" else birth_year,
        "MARITAL_STATUS": marital_1998_codes(line[150:151]),
        # Fix: the original sliced a single character (61:62), which can
        # neither hold a two-digit value nor ever equal "97"/"98"/"99".
        # NOTE(review): width 2 inferred from the other waves - confirm the
        # exact columns against the 1998 codebook.
        "EDUCATION": drop_education_nonresponse(line[61:63]),
    }


def parse_1998_health(line):
    """Parse one record of the 1998 ``H98B_R.DA`` file."""
    weight = weight_kg(line[196:199], 400)
    feet = line[200:202].strip()
    inches = line[202:204]
    if feet not in ("", "98", "99") and not pd.isna(weight):
        # Inches coded 98 are ignored and the feet value is used alone.
        body_mass = bmi(weight, feet, inches) if inches != "98" else bmi(weight, feet)
    else:
        body_mass = np.nan

    heart = line[66:67]
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SMOKED": keep_codes(line[157:158], ("1", "5")),
        "DRINKED": drinks_alcohol(line[184:185]),
        "PHYSICAL_ACTIVITY_LEVEL": np.nan,
        "BMI": body_mass,
        "HEART_PROBL": np.nan if heart == "8" else heart,
        "STROKE": stroke_before_2000(line[87:88]),
    }


def parse_2000_demographics(line):
    """Parse one record of the 2000 ``H00A_R.DA`` file."""
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[31:35],
        "MARITAL_STATUS": marital_1998_codes(line[152:153]),
        "EDUCATION": drop_education_nonresponse(line[63:65]),
    }


def parse_2000_health(line):
    """Parse one record of the 2000 ``H00B_R.DA`` file."""
    weight = weight_kg(line[193:196], 400)
    feet = line[197:198].strip()
    if feet not in ("", "9") and not pd.isna(weight):
        body_mass = bmi(weight, feet, line[198:200])
    else:
        body_mass = np.nan
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SMOKED": keep_codes(line[154:155], ("1", "5")),
        "DRINKED": drinks_alcohol(line[181:182]),
        "PHYSICAL_ACTIVITY_LEVEL": np.nan,
        "BMI": body_mass,
        "HEART_PROBL": heart_from_2000(line[65:66]),
        "STROKE": stroke_from_2000(line[86:87]),
    }


def parse_birth_year_sex(line, birth_year_at, sex_at):
    """Parse HHID, PN, a four-character birth year and a one-character sex."""
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "BIRTH_YEAR": line[birth_year_at:birth_year_at + 4],
        "SEX": line[sex_at:sex_at + 1],
    }


def parse_2002_health(line):
    """Parse one record of the 2002 ``H02C_R.da`` file."""
    weight = weight_kg(line[190:193], 400)
    feet = line[194:195].strip()
    inches = line[195:197]
    if feet not in ("", "8") and not pd.isna(weight):
        if inches not in ("98", ""):
            body_mass = bmi(weight, feet, inches)
        else:
            body_mass = bmi(weight, feet)
    else:
        body_mass = np.nan
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "SMOKED": keep_codes(line[155:156], ("1", "5")),
        "DRINKED": drinks_alcohol(line[179:180]),
        "BMI": body_mass,
        "HEART_PROBL": heart_from_2000(line[59:60]),
        "STROKE": stroke_from_2000(line[80:81]),
    }


def parse_2002_activity(line):
    """Parse one record of the 2002 ``H02V_R.da`` file.

    Frequency codes: 1 more than once a week, 2 once a week, 3 one to three
    times a month, 4 hardly ever or never, 8 DK, 9 RF.

    NOTE(review): the level is stored as a string here but as an int for
    1992/1994, so the combined column has mixed types.
    """
    vigorous, moderate = line[151:152], line[152:153]
    if vigorous in ("1", "2"):
        activity = "3"
    elif moderate in ("1", "2"):
        activity = "2"
    elif vigorous in ("3", "4") or moderate in ("3", "4"):
        activity = "1"
    else:
        activity = np.nan
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "PHYSICAL_ACTIVITY_LEVEL": activity,
    }


def parse_2002_demographics(line):
    """Parse one record of the 2002 ``H02B_R.da`` file."""
    return {
        "HHID": line[0:6],
        "PN": line[6:9],
        "MARITAL_STATUS": marital_1998_codes(line[131:132]),
        "EDUCATION": drop_education_nonresponse(line[42:44]),
    }


if __name__ == "__main__":
    # ---- 1992 ----
    result = read_fixed_width(
        "/root/r_base/HRS/1992/data/health.da", parse_1992_health, FULL_COLUMNS, wave=1992
    )

    # ---- 1993 ----
    result_1993 = read_fixed_width(
        "/root/r_base/HRS/1993/data/BR21.DA", parse_1993_br21, FULL_COLUMNS, wave=1993
    )
    result = pd.concat([result, result_1993], axis=0)

    # ---- 1994 ----
    result_1994_one = read_fixed_width(
        "/root/r_base/HRS/1994/data/W2a.da",
        parse_1994_w2a,
        KEY_COLUMNS + ["BIRTH_YEAR", "SEX", "MARITAL_STATUS", "EDUCATION"],
        wave=1994,
    )
    result_1994_two = read_fixed_width(
        "/root/r_base/HRS/1994/data/W2B.DA", parse_1994_w2b, HEALTH_COLUMNS
    )
    result_1994 = pd.merge(result_1994_one, result_1994_two, on=["HHID", "PN"], how="left")
    result = pd.concat([result, result_1994], axis=0)
    print(result.head())

    # ---- 1995 ----
    result_1995_one = read_fixed_width(
        "/root/r_base/HRS/1995/data/A95A_R.DA",
        parse_1995_demographics,
        DEMOGRAPHIC_COLUMNS,
        wave=1995,
    )
    result_1995_two = read_fixed_width(
        "/root/r_base/HRS/1995/data/A95CS_R.DA", parse_1995_sex, SEX_COLUMNS
    )
    result_1995 = pd.merge(result_1995_one, result_1995_two, on=["HHID", "PN"], how="left")
    result_1995_three = read_fixed_width(
        "/root/r_base/HRS/1995/data/A95B_R.DA", parse_1995_health, HEALTH_COLUMNS
    )
    result_1995 = pd.merge(result_1995, result_1995_three, on=["HHID", "PN"], how="left")
    result = pd.concat([result, result_1995], axis=0)

    # ---- 1996 ----
    result_1996_one = read_fixed_width(
        "/root/r_base/HRS/1996/data/H96A_R.DA",
        parse_1996_demographics,
        DEMOGRAPHIC_COLUMNS,
        wave=1996,
    )
    result_1996_two = read_fixed_width(
        "/root/r_base/HRS/1996/data/H96CS_R.DA",
        lambda line: parse_sex(line, 74),
        SEX_COLUMNS,
    )
    result_1996 = pd.merge(result_1996_one, result_1996_two, on=["HHID", "PN"], how="left")
    result_1996_three = read_fixed_width(
        "/root/r_base/HRS/1996/data/H96B_R.DA", parse_1996_health, HEALTH_COLUMNS
    )
    result_1996 = pd.merge(result_1996, result_1996_three, on=["HHID", "PN"], how="left")
    result = pd.concat([result, result_1996], axis=0)

    # ---- 1998 ----
    result_1998_one = read_fixed_width(
        "/root/r_base/HRS/1998/data/H98A_R.DA",
        parse_1998_demographics,
        DEMOGRAPHIC_COLUMNS,
        wave=1998,
    )
    result_1998_two = read_fixed_width(
        "/root/r_base/HRS/1998/data/H98CS_R.DA",
        lambda line: parse_sex(line, 70),
        SEX_COLUMNS,
    )
    result_1998 = pd.merge(result_1998_one, result_1998_two, on=["HHID", "PN"], how="left")
    result_1998_three = read_fixed_width(
        "/root/r_base/HRS/1998/data/H98B_R.DA", parse_1998_health, HEALTH_COLUMNS
    )
    result_1998 = pd.merge(result_1998, result_1998_three, on=["HHID", "PN"], how="left")
    result = pd.concat([result, result_1998], axis=0)

    # ---- 2000 ----
    result_2000_one = read_fixed_width(
        "/root/r_base/HRS/2000/data/H00A_R.DA",
        parse_2000_demographics,
        DEMOGRAPHIC_COLUMNS,
        wave=2000,
    )
    result_2000_two = read_fixed_width(
        "/root/r_base/HRS/2000/data/H00CS_R.DA",
        lambda line: parse_sex(line, 79),
        SEX_COLUMNS,
    )
    result_2000 = pd.merge(result_2000_one, result_2000_two, on=["HHID", "PN"], how="left")
    result_2000_three = read_fixed_width(
        "/root/r_base/HRS/2000/data/H00B_R.DA", parse_2000_health, HEALTH_COLUMNS
    )
    result_2000 = pd.merge(result_2000, result_2000_three, on=["HHID", "PN"], how="left")
    result = pd.concat([result, result_2000], axis=0)

    # ---- 2002 ----
    result_2002_one = read_fixed_width(
        "/root/r_base/HRS/2002/data/H02PR_R.da",
        lambda line: parse_birth_year_sex(line, 36, 19),
        KEY_COLUMNS + ["BIRTH_YEAR", "SEX"],
        wave=2002,
    )
    result_2002_two = read_fixed_width(
        "/root/r_base/HRS/2002/data/H02C_R.da",
        parse_2002_health,
        KEY_COLUMNS + ["SMOKED", "DRINKED", "BMI", "HEART_PROBL", "STROKE"],
    )
    result_2002 = pd.merge(result_2002_one, result_2002_two, on=["HHID", "PN"], how="left")
    result_2002_three = read_fixed_width(
        "/root/r_base/HRS/2002/data/H02V_R.da",
        parse_2002_activity,
        KEY_COLUMNS + ["PHYSICAL_ACTIVITY_LEVEL"],
    )
    result_2002 = pd.merge(result_2002, result_2002_three, on=["HHID", "PN"], how="left")
    result_2002_four = read_fixed_width(
        "/root/r_base/HRS/2002/data/H02B_R.da",
        parse_2002_demographics,
        KEY_COLUMNS + ["MARITAL_STATUS", "EDUCATION"],
    )
    result_2002 = pd.merge(result_2002, result_2002_four, on=["HHID", "PN"], how="left")
    result = pd.concat([result, result_2002], axis=0)

    # ---- 2004 ----
    result_2004_one = read_fixed_width(
        "/root/r_base/HRS/2004/data/H04PR_R.da",
        lambda line: parse_birth_year_sex(line, 25, 20),
        KEY_COLUMNS + ["BIRTH_YEAR", "SEX"],
        wave=2004,
    )

    with open("/root/r_base/HRS/2004/data/H04C_R.da", "r", encoding="utf-8") as file:
        HHID_list = []
        PN_list = []
        SMOKED_list = []
        DRINKED_list = []
        BMI_list = []
        HEART_PROBL_list = []
        PHYSICAL_ACTIVITY_LEVEL_list = []
        STROKE_list = []
        # Read the file line by line.
        for line in file:
            HHID = line[0:6]
            PN = line[6:9]
            # 1. YES
            # 5. NO
            SMOKED = line[167:168] if line[167:168] == "1" or line[167:168] == "5" else np.nan
            # 1. YES
            # 3. [VOL] NEVER HAVE USED ALCOHOL
            # 5. NO
            # 8. DK (don't know); NA (not ascertained)
            # 9. RF (refused)
            # After recoding:
            # 1. Yes
            # 5. No
            DRINKED = "1" if line[192:193] == "1" else "5" if line[192:193] == "5" or line[192:193] == "3" else np.nan
            # 3. vigorous (vigorous activity more than once a week)
            # 2. moderate (moderate activity more than once a week)
            # 1.
inactive (the rest) # 体重 WEIGH= float(line[203:206])*0.45359237 if not float(line[203:206])>400 else np.nan # 身高 HEIGHT = line[222:223].strip() if not HEIGHT == "" and not HEIGHT == "8" and not pd.isna(WEIGH): if not line[223:225] == "98" and not line[223:225]=="": HEIGHT = float(HEIGHT)*0.3048 + float(line[223:225])*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(line[222:223])*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = np.nan if line[69:70] == "8" or line[69:70] == "9" else "1" if line[69:70] == "1" or line[69:70] == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. No [Inap in V418-V421] STROKE = np.nan if line[90:91] == "8" or line[90:91] == "9" else "5" if line[90:91] == "2" or line[90:91] == "4" or line[90:91] == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. 
inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = "3" if line[164:165] == "1" or line[164:165] == "2" or line[164:165] == "7" else "1" if line[164:165] == "3" or line[164:165] == "4" else np.nan #中度活动 MODERATE_PHYSICAL = "2" if line[165:166] == "1" or line[165:166] == "2" or line[165:166] == "7" else "1" if line[165:166] == "3" or line[165:166] == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2004_two = pd.DataFrame(data) result_2004 = pd.merge(result_2004_one, result_2004_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2004/data/H04B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) MARITAL_STATUS = line[161:162] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. 
RF (refused) EDUCATION = np.nan if line[41:43] == "97" or line[41:43] == "98" or line[41:43] == "99" else line[41:43] HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2004_four = pd.DataFrame(data) result_2004 = pd.merge(result_2004, result_2004_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2004], axis=0) # 获取2006数据 with(open("/root/r_base/HRS/2006/data/H06PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] # 1.Male # 2.Female SEX = line[20:21] HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2006 result_2006_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2006/data/H06C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[181:182] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. [VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[207:208] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. 
inactive (the rest) # 体重 WEIGH = line[218:221].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan # 身高 HEIGHT_FEET = line[252:253] HEIGHT_INCHES = line[253:255] if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[80:81] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. No [Inap in V418-V421] STROKE = line[101:102] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. 
inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[178:179] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[179:180] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2006_two = pd.DataFrame(data) result_2006 = pd.merge(result_2006_one, result_2006_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2006/data/H06B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. 
RF (Refused) MARITAL_STATUS = line[177:178] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[47:49] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2006_four = pd.DataFrame(data) result_2006 = pd.merge(result_2006, result_2006_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2006], axis=0) # 获取2008数据 with(open("/root/r_base/HRS/2008/data/H08PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] # 1.Male # 2.Female SEX = line[20:21] HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2008 result_2008_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2008/data/H08C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[258:259] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. 
[VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[284:285] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[295:298] WEIGH= float(WEIGH)*0.45359237 if not float(WEIGH)>400 else np.nan # 身高 HEIGHT_FEET = line[329:330].strip() HEIGHT_INCHES = line[330:338].strip() if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "99998" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[128:129] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. No [Inap in V418-V421] STROKE = line[149:150] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. 
HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[255:256] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[256:257] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2008_two = pd.DataFrame(data) result_2008 = pd.merge(result_2008_one, result_2008_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2008/data/H08B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. 
RF (Refused) MARITAL_STATUS = line[311:312] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[48:50] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2008_four = pd.DataFrame(data) result_2008 = pd.merge(result_2008, result_2008_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2008], axis=0) # 获取2010数据 with(open("/root/r_base/HRS/2010/data/H10PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] # 1.Male # 2.Female SEX = line[20:21] HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2010 result_2010_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2010/data/H10C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[250:251] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. 
[VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[276:277] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[287:290].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan # 身高 HEIGHT_FEET = line[304:305].strip() HEIGHT_INCHES = line[305:313] if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "99998" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[76:77] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. No [Inap in V418-V421] STROKE = line[139:140] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. 
HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[247:248] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[248:249] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2010_two = pd.DataFrame(data) result_2010 = pd.merge(result_2010_one, result_2010_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2010/data/H10B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. 
RF (Refused) MARITAL_STATUS = line[305:306] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[48:50] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2010_four = pd.DataFrame(data) result_2010 = pd.merge(result_2010, result_2010_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2010], axis=0) # 获取2012数据 with(open("/root/r_base/HRS/2012/data/H12PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan # 1.Male # 2.Female SEX = line[20:21] if not line[20:21] == "" else np.nan HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2012 result_2012_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2012/data/H12C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. 
NO SMOKED = line[249:250] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. [VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[276:277] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[287:290].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan # 身高 HEIGHT_FEET = line[302:303].strip() HEIGHT_INCHES = line[303:308] if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[82:83] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. 
No [Inap in V418-V421] STROKE = line[146:147] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[246:247] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[247:248] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2012_two = pd.DataFrame(data) result_2012 = pd.merge(result_2012_one, result_2012_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2012/data/H12B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. 
WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) MARITAL_STATUS = line[294:295] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[48:50] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2012_four = pd.DataFrame(data) result_2012 = pd.merge(result_2012, result_2012_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2012], axis=0) # 获取2014数据 with(open("/root/r_base/HRS/2014/data/H14PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan # 1.Male # 2.Female SEX = line[20:21] if not line[20:21] == "" else np.nan HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2014 result_2014_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2014/data/H14C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line 
in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[214:215] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. [VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[239:240] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[250:253].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan # 身高 HEIGHT_FEET = line[259:260].strip() HEIGHT_INCHES = line[260:265] if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[66:67] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. 
No [Inap in V418-V421] STROKE = line[122:123] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[211:212] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[212:213] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2014_two = pd.DataFrame(data) result_2014 = pd.merge(result_2014_one, result_2014_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2014/data/H14B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. 
WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) MARITAL_STATUS = line[274:275] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[39:41] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2014_four = pd.DataFrame(data) result_2014 = pd.merge(result_2014, result_2014_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2014], axis=0) # 获取2016数据 with(open("/root/r_base/HRS/2016/data/H16PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan # 1.Male # 2.Female SEX = line[20:21] if not line[20:21] == "" else np.nan HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2016 result_2016_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2016/data/H16C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line 
in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[237:238] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. [VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[262:263] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[273:276].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 else np.nan # 身高 HEIGHT_FEET = line[282:283].strip() HEIGHT_INCHES = line[283:288] if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # 1. YES # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[67:68] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" else "1" if HEART_PROBL == "1" or HEART_PROBL == "3" else "5" # 1. YES # 2. [VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK) # 3. DISPUTES PREVIOUS WAVE RECORD, BUT NOW HAS CONDITION # 4. DISPUTES PREVIOUS WAVE RECORD, DOES NOT HAVE CONDITION # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # STROKE # 1. Yes # 5. 
No [Inap in V418-V421] STROKE = line[123:124] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[234:235] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[235:236] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2016_two = pd.DataFrame(data) result_2016 = pd.merge(result_2016_one, result_2016_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2016/data/H16B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. SEPARATED # 4. DIVORCED # 5. 
WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) MARITAL_STATUS = line[275:276] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[39:41] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2016_four = pd.DataFrame(data) result_2016 = pd.merge(result_2016, result_2016_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2016], axis=0) # 获取2018数据 with(open("/root/r_base/HRS/2018/data/H18PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[25:29] if not line[25:29] == "" else np.nan # 1.Male # 2.Female SEX = line[20:21] if not line[20:21] == "" else np.nan HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2018 result_2018_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2018/data/H18C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] STROKE_list=[] # 逐行读取文件 for line 
in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[381:382] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. [VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[404:406] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[418:421].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 and not float(WEIGH)<0 else np.nan # 身高 HEIGHT_FEET = line[428:430].strip() HEIGHT_INCHES = line[430:435].strip() if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES == "99" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # -8. Web non-response # 1. YES # 4. [NEVER HAD HEART PROBLEM] # 5. NO # 6. [HAD HEART PROBLEM BEFORE, BUT DO NOT HAVE IT NOW AND AM NOT # TAKING MEDICATION FOR IT] # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[86:88] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" or HEART_PROBL == "-8" else "1" if HEART_PROBL == "1" else "5" # -8. Web non-response # 1. YES # 2. [[VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC # ATTACK)/POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)] # 4. [NEVER HAD A STROKE] # 5. NO # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) # STROKE # 1. Yes # 5. 
No [Inap in V418-V421] STROKE = line[162:164] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" or STROKE == "-8" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[367:369] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[369:371] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2018_two = pd.DataFrame(data) result_2018 = pd.merge(result_2018_one, result_2018_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2018/data/H18B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. 
SEPARATED # 4. DIVORCED # 5. WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) MARITAL_STATUS = line[287:288] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[41:43] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2018_four = pd.DataFrame(data) result_2018 = pd.merge(result_2018, result_2018_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2018], axis=0) # 获取2020数据 with(open("/root/r_base/HRS/2020/data/H20PR_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] BIRTH_YEAR_list = [] SEX_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] BIRTH_YEAR = line[36:40] if not line[36:40] == "" else np.nan # 1.Male # 2.Female SEX = line[33:34] if not line[33:34] == "" else np.nan HHID_list.append(HHID) PN_list.append(PN) BIRTH_YEAR_list.append(BIRTH_YEAR) SEX_list.append(SEX) data = { "HHID":HHID_list, "PN":PN_list, "BIRTH_YEAR":BIRTH_YEAR_list, "SEX":SEX_list, } data["WAVE"] = 2020 result_2020_one = pd.DataFrame(data) with(open("/root/r_base/HRS/2020/data/H20C_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] SMOKED_list = [] DRINKED_list = [] BMI_list = [] HEART_PROBL_list = [] PHYSICAL_ACTIVITY_LEVEL_list = [] 
STROKE_list=[] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. YES # 5. NO SMOKED = line[339:340] SMOKED = SMOKED if SMOKED == "1" or SMOKED == "5" else np.nan # 1. YES # 3. [VOL] NEVER HAVE USED ALCOHOL # 5. NO # 8. DK (don't know); NA (not ascertained) # 9. RF (refused) # 合并后 # 1. Yes # 5. No [Inap in V502-V505] DRINKED = line[363:365] DRINKED = "1" if DRINKED == "1" else "5" if DRINKED == "5" or DRINKED == "3" else np.nan # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 体重 WEIGH = line[380:383].strip() WEIGH= float(WEIGH)*0.45359237 if not WEIGH=="" and not float(WEIGH)>400 and not float(WEIGH)<0 else np.nan # 身高 HEIGHT_FEET = line[389:390].strip() HEIGHT_INCHES = line[390:395].strip() if not HEIGHT_FEET == "" and not HEIGHT_FEET == "8" and not HEIGHT_FEET == "9" and not pd.isna(WEIGH): if not HEIGHT_INCHES == "98" and not HEIGHT_INCHES=="": HEIGHT = float(HEIGHT_FEET)*0.3048 + float(HEIGHT_INCHES)*0.0254 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else: HEIGHT = float(HEIGHT_FEET)*0.3048 # BMI BMI = WEIGH / math.pow(HEIGHT,2) else : BMI = np.nan # -8. Web non-response # 1. YES # 4. [NEVER HAD HEART PROBLEM] # 5. NO # 6. [HAD HEART PROBLEM BEFORE, BUT DO NOT HAVE IT NOW AND AM NOT # TAKING MEDICATION FOR IT] # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) # A heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems # 1. Yes # 5. No [Inap in V418-V421] HEART_PROBL = line[73:75] HEART_PROBL = np.nan if HEART_PROBL == "8" or HEART_PROBL == "9" or HEART_PROBL == "" or HEART_PROBL == "-8" else "1" if HEART_PROBL == "1" else "5" # -8. Web non-response # 1. YES # 2. [[VOL] POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC # ATTACK)/POSSIBLE STROKE OR TIA (TRANSIENT ISCHEMIC ATTACK)] # 4. [NEVER HAD A STROKE] # 5. NO # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) # STROKE # 1. Yes # 5. 
No [Inap in V418-V421] STROKE = line[138:140] STROKE = np.nan if STROKE == "8" or STROKE == "9" or STROKE == "" or STROKE == "-8" else "5" if STROKE == "2" or STROKE == "4" or STROKE == "5" else "1" # 1. MORE THAN ONCE A WEEK # 2. ONCE A WEEK # 3. ONCE TO THREE TIMES A MONTH # 4. HARDLY EVER OR NEVER # 7. (VOL) EVERY DAY # 8. DK (Don't Know) # 9. RF (Refused) # 3. vigorous (vigorous activity more than once a week) # 2. moderate (moderate activity more than once a week) # 1. inactive (the rest) # 重度活动 VIGOROUS_PHYSICAL = line[325:327] VIGOROUS_PHYSICAL = "3" if VIGOROUS_PHYSICAL == "1" or VIGOROUS_PHYSICAL == "2" or VIGOROUS_PHYSICAL == "7" else "1" if VIGOROUS_PHYSICAL == "3" or VIGOROUS_PHYSICAL == "4" else np.nan #中度活动 MODERATE_PHYSICAL = line[327:329] MODERATE_PHYSICAL = "2" if MODERATE_PHYSICAL == "1" or MODERATE_PHYSICAL == "2" or MODERATE_PHYSICAL == "7" else "1" if MODERATE_PHYSICAL == "3" or MODERATE_PHYSICAL == "4" else np.nan # 轻度活动 LIGHT_PHYSICAL = np.nan PHYSICAL_ACTIVITY_LEVEL = "3" if VIGOROUS_PHYSICAL=="3" else "2" if MODERATE_PHYSICAL=="2" else "1" if VIGOROUS_PHYSICAL=="1" or MODERATE_PHYSICAL=="1" else np.nan HHID_list.append(HHID) PN_list.append(PN) PHYSICAL_ACTIVITY_LEVEL_list.append(PHYSICAL_ACTIVITY_LEVEL) SMOKED_list.append(SMOKED) DRINKED_list.append(DRINKED) BMI_list.append(BMI) HEART_PROBL_list.append(HEART_PROBL) STROKE_list.append(STROKE) data = { "HHID":HHID_list, "PN":PN_list, "SMOKED":SMOKED_list, "DRINKED":DRINKED_list, "BMI":BMI_list, "HEART_PROBL":HEART_PROBL_list, "STROKE":STROKE_list, "PHYSICAL_ACTIVITY_LEVEL":PHYSICAL_ACTIVITY_LEVEL_list } result_2020_two = pd.DataFrame(data) result_2020 = pd.merge(result_2020_one, result_2020_two, on=["HHID", "PN"], how="left") with(open("/root/r_base/HRS/2020/data/H20B_R.da", "r", encoding="utf-8") )as file: HHID_list = [] PN_list = [] MARITAL_STATUS_list = [] EDUCATION_list = [] # 逐行读取文件 for line in file: HHID = line[0:6] PN = line[6:9] # 1. MARRIED (VOL) # 2. ANULLED (VOL) # 3. 
SEPARATED # 4. DIVORCED # 5. WIDOWED # 6. NEVER MARRIED # 7. OTHER (SPECIFY) # 8. DK (Don't Know); NA (Not Ascertained) # 9. RF (Refused) MARITAL_STATUS = line[304:305] # 1 Married or Partner; 5 other MARITAL_STATUS = "1" if MARITAL_STATUS=="1" else "5" if MARITAL_STATUS=="2" or MARITAL_STATUS=="3" or MARITAL_STATUS=="4" or MARITAL_STATUS=="5" or MARITAL_STATUS=="6" or MARITAL_STATUS=="7" else np.nan # 0 For no formal education # 1-11 .....Grades # 12 .......High school # 13-15 ....Some college # 16 .......College grad # 17 .......Post college (17+ years) # 97 .......Other # 98. DK (don't know); NA (not ascertained) # 99. RF (refused) EDUCATION = line[40:42] EDUCATION = np.nan if EDUCATION == "97" or EDUCATION == "98" or EDUCATION == "99" or EDUCATION == "" else EDUCATION HHID_list.append(HHID) PN_list.append(PN) MARITAL_STATUS_list.append(MARITAL_STATUS) EDUCATION_list.append(EDUCATION) data = { "HHID":HHID_list, "PN":PN_list, "MARITAL_STATUS":MARITAL_STATUS_list, "EDUCATION":EDUCATION_list } result_2020_four = pd.DataFrame(data) result_2020 = pd.merge(result_2020, result_2020_four, on=["HHID", "PN"], how="left") result = pd.concat([result, result_2020], axis=0) result.to_csv("/root/r_base/HRS/result_all.csv", index=False)