|
@@ -1,10 +1,12 @@
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import pyreadstat
|
|
|
+from datetime import date
|
|
|
+from lunarcalendar import Converter, Lunar
|
|
|
|
|
|
#统一列名
|
|
|
def change_columns(df):
|
|
|
- df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "marital_status" , "education", 'province', 'city',"Height", "Weight",
|
|
|
+ df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
|
|
|
"waist", "Systolic","Diastolic",
|
|
|
|
|
|
'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp',
|
|
@@ -18,7 +20,7 @@ def change_columns(df):
|
|
|
|
|
|
'Smoke','Drink',
|
|
|
|
|
|
- "Cognition_score", "Psychiatric_score","sleep_state", "wave"
|
|
|
+ "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
|
|
|
]
|
|
|
# 2020年把帕金森和记忆病症分开,需要和以前对齐
|
|
|
def process_row(row):
|
|
@@ -54,7 +56,7 @@ if __name__ == "__main__":
|
|
|
health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
|
|
|
health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
|
|
|
exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
|
|
|
-
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
|
|
|
#性别#年龄#居住地#婚姻状况
|
|
|
# 1 married or partnered
|
|
|
# 0 other marital status (separated, divorced, unmarried, or widowed)
|
|
@@ -65,8 +67,11 @@ if __name__ == "__main__":
|
|
|
# 1 high school
|
|
|
# 2 college or above
|
|
|
demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
|
|
|
- data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','marital_status', 'education']]
|
|
|
+ data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
|
|
|
|
|
|
#居住地
|
|
|
data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
@@ -205,6 +210,16 @@ if __name__ == "__main__":
|
|
|
# (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
data_2011["sleep_state"] = health_status['dc015']
|
|
|
|
|
|
+ # ADL score: each of the six items db010-db015 is coded 0 if the answer is 1, 1 if it is >= 2, and NaN otherwise; the codes are then summed, so a NaN in any item gives a NaN total.
+ # NOTE(review): the summed Series is assigned by index label, so this assumes data_2011 and health_status share the same row index - TODO confirm.
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
data_2011["wave"] = year
|
|
|
change_columns(data_2011)
|
|
|
# 2011年的ID和其他年份有一点区别,倒数第三位加0
|
|
@@ -218,6 +233,7 @@ if __name__ == "__main__":
|
|
|
health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
|
|
|
exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
|
|
|
|
|
|
#性别#年龄#婚姻状况
|
|
|
# 1 married or partnered
|
|
@@ -252,21 +268,27 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 纠正2011年统计错误的出生年
|
|
|
demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
|
|
|
- birth_year_2013 = demo[['ID',"birth_year"]]
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
|
|
|
+ birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
|
|
|
# 按 'ID' 列合并两个表
|
|
|
data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
|
|
|
# 使用 fillna() 来更新字段
|
|
|
data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
|
|
|
+ data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
|
|
|
# 删除多余的列
|
|
|
- data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011'])
|
|
|
+ data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
|
|
|
#合并2011年的出生年
|
|
|
- birth_year_2011 = data_2011[['ID',"birth_year"]]
|
|
|
+ birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
|
|
|
# 按 'ID' 列合并两个表
|
|
|
demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
|
|
|
# 使用 fillna() 来更新字段
|
|
|
demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
|
|
|
+ demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
|
|
|
- data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','marital_status', "education"]]
|
|
|
+ data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
|
|
|
|
|
|
#居住地
|
|
|
data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
@@ -425,6 +447,16 @@ if __name__ == "__main__":
|
|
|
# (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
# (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
data_2013["sleep_state"] = health_status['dc015']
|
|
|
+
|
|
|
+ # ADL score: each of the six items db010-db015 is coded 0 if the answer is 1, 1 if it is >= 2, and NaN otherwise; the codes are then summed, so a NaN in any item gives a NaN total.
+ # NOTE(review): the summed Series is assigned by index label, so this assumes data_2013 and health_status share the same row index - TODO confirm.
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
|
|
|
data_2013["wave"] = year
|
|
|
change_columns(data_2013)
|
|
@@ -438,6 +470,7 @@ if __name__ == "__main__":
|
|
|
biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
|
|
|
health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
|
|
|
|
|
|
#性别#年龄#婚姻状况
|
|
|
# 1 married or partnered
|
|
@@ -460,8 +493,12 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 2015年的出生年
|
|
|
demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
|
|
|
- data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year', 'marital_status', 'education']]
|
|
|
+ data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
|
|
|
|
|
|
#居住地
|
|
|
data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
@@ -620,6 +657,16 @@ if __name__ == "__main__":
|
|
|
# (3)Occasionally or a moderate amount of the time (3-4 days) 有时或者说有一半的时间(3-4天)
|
|
|
# (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
data_2015["sleep_state"] = health_status['dc015']
|
|
|
+
|
|
|
+ # ADL score: each of the six items db010-db015 is coded 0 if the answer is 1, 1 if it is >= 2, and NaN otherwise; the codes are then summed, so a NaN in any item gives a NaN total.
+ # NOTE(review): the summed Series is assigned by index label, so this assumes data_2015 and health_status share the same row index - TODO confirm.
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
|
|
|
data_2015["wave"] = year
|
|
|
change_columns(data_2015)
|
|
@@ -632,6 +679,7 @@ if __name__ == "__main__":
|
|
|
health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
|
|
|
cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
|
|
|
|
|
|
#性别#年龄#婚姻状况
|
|
|
# 1 married or partnered
|
|
@@ -648,8 +696,12 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 出生年
|
|
|
demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
|
|
|
|
|
|
- data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
+
|
|
|
+ data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
|
|
|
|
|
|
#居住地
|
|
|
data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
@@ -794,6 +846,16 @@ if __name__ == "__main__":
|
|
|
# (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x)
|
|
|
|
|
|
+ # ADL score: each of the six items db010-db015 is coded 0 if the answer is 1, 1 if it is >= 2, and NaN otherwise; the codes are then summed, so a NaN in any item gives a NaN total.
+ # NOTE(review): the summed Series is assigned by index label, so this assumes data_2018 and health_status share the same row index - TODO confirm.
|
|
|
+ health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
data_2018["wave"] = year
|
|
|
change_columns(data_2018)
|
|
|
data_2018 = pd.concat([data_2015, data_2018], axis=0)
|
|
@@ -803,6 +865,7 @@ if __name__ == "__main__":
|
|
|
demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
|
|
|
psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
|
|
|
health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
|
|
|
+ weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
|
|
|
|
|
|
#性别#年龄#婚姻状况
|
|
|
# 1 married or partnered
|
|
@@ -824,14 +887,21 @@ if __name__ == "__main__":
|
|
|
|
|
|
# Birth year/month: keep the 2020 answer (ba003_1 / ba003_2) when it is present, otherwise NaN; the NaN gaps are filled from the 2018 wave by the merge + fillna below. The check must be `not pd.isna`: testing `pd.isna` alone returns NaN on both branches and discards every 2020 answer.
|
|
|
demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if not pd.isna(x["ba003_1"]) else np.nan, axis=1)
|
|
|
+ demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if not pd.isna(x["ba003_2"]) else np.nan, axis=1)
|
|
|
#合并2018年的出生年
|
|
|
- birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year"]]
|
|
|
+ birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
|
|
|
# 按 'ID' 列合并两个表
|
|
|
demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
|
|
|
# 使用 fillna() 来更新字段
|
|
|
demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
|
|
|
+ demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
|
|
|
+
|
|
|
+ #获取随访时间
|
|
|
+ demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
|
|
|
|
|
|
- data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
|
|
|
+ demo["ba003"] = 1
|
|
|
+
|
|
|
+ data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', 'education']]
|
|
|
#居住地
|
|
|
data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
|
|
|
|
|
@@ -976,6 +1046,16 @@ if __name__ == "__main__":
|
|
|
# (4)Most or all of the time (5-7 days) 大多数的时间(5-7天)
|
|
|
data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x)
|
|
|
|
|
|
+ # ADL score: the 2020 wave reads db001, db003, db005, db007, db009 and db011 but stores the codes under the same db010_score-db015_score names used for earlier waves (presumably the 2020 equivalents of those items - verify against the codebook). Each item is coded 0 if the answer is 1, 1 if it is >= 2, and NaN otherwise; the codes are then summed, so a NaN in any item gives a NaN total.
+ # NOTE(review): the summed Series is assigned by index label, so this assumes data_2020 and health_status share the same row index - TODO confirm.
|
|
|
+ health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
|
|
|
+ data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
|
|
|
+ health_status["db014_score"] + health_status["db015_score"]
|
|
|
+
|
|
|
data_2020["wave"] = year
|
|
|
change_columns(data_2020)
|
|
|
data_2020 = pd.concat([data_2018, data_2020], axis=0)
|
|
@@ -998,5 +1078,6 @@ if __name__ == "__main__":
|
|
|
data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
|
|
|
#襄樊市->襄阳市
|
|
|
data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市')
|
|
|
+
|
|
|
data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
|
|
|
print(123)
|