1 month ago · a4e9339d5c
--- a/CHARLS_P/CHARLS_harmonized.py
+++ b/CHARLS_P/CHARLS_harmonized.py
@@ -1,6 +1,8 @@
 
															 import pandas as pd
														
 
															 import numpy as np
														
 
															 import pyreadstat
														
 
															+from datetime import date
														
 
															+from lunarcalendar import Converter, Lunar
														
 
															 # 定义一个函数，用于更新 harmonized 中的 mstat 列
														
@@ -43,6 +45,43 @@ def merge_data(harmonized, waves, flag="other"):
 
															         )[col_name])
														
 
															     return merged_data
														
 
															+# 通过 groupby 采用少数服从多数原则填充性别
														
 
															+def fill_gender(group, col):
														
 
															+    # 计算性别众数
														
 
															+    mode_gender = group[col].mode()
														
 
															+    if not mode_gender.empty:
														
 
															+        # 用众数替换组内所有性别值
														
 
															+        group[col] = mode_gender[0]
														
 
															+    return group
														
 
															+
														
 
															+def calculate_age(row):
														
 
															+    # 检查空值
														
 
															+    if pd.isnull(row['birth_year']) or pd.isnull(row['birth_month']) or pd.isnull(row['iyear']) or pd.isnull(row['imonth']):
														
 
															+        return np.nan  # 返回 NaN 代表无法计算年龄
														
 
															+
														
 
															+    # 获取出生年月
														
 
															+    birth_year = int(row['birth_year'])
														
 
															+    birth_month = int(row['birth_month'])
														
 
															+    if birth_month == 0:
														
 
															+        birth_month = 6
														
 
															+    # 确定出生日期
														
 
															+    if row['ba003'] == 1:
														
 
															+        # 公历
														
 
															+        birth_date = date(birth_year, birth_month, 1)
														
 
															+    else:
														
 
															+        lunar = Lunar(birth_year, birth_month, 1, isleap=False)
														
 
															+        # 农历
														
 
															+        birth_date = Converter.Lunar2Solar(lunar)
														
 
															+    
														
 
															+    # 获取随访年月
														
 
															+    followup_year = int(row['iyear'])
														
 
															+    followup_month = int(row['imonth'])
														
 
															+    followup_date = date(followup_year, followup_month, 1)
														
 
															+    
														
 
															+    # 计算年龄
														
 
															+    age = followup_date.year - birth_date.year - ((followup_date.month, followup_date.day) < (birth_date.month, birth_date.day))
														
 
															+    return age    
														
 
															+
														
 
															 if __name__ == "__main__":
														
 
															     harmonized, meta = pyreadstat.read_dta("/root/r_base/CHARLS/Harmonized_CHARLS/H_CHARLS_D_Data.dta")
														
 
															     CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
														
@@ -70,7 +109,7 @@ if __name__ == "__main__":
 
															     #BMI
														
 
															     waves = [(2011, "r1mbmi"), (2013, "r2mbmi"), (2015, "r3mbmi")]
														
 
															-    CHARLS_data["BMI"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
														
 
															+    CHARLS_data["BMI_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
														
 
															     #收缩压#舒张压
														
 
															     waves = [(2011, "r1systo"), (2013, "r2systo"), (2015, "r3systo")]
														
@@ -135,6 +174,14 @@ if __name__ == "__main__":
 
															     waves = [(2011, "r1sleeprl"), (2013, "r2sleeprl"), (2015, "r3sleeprl"), (2018, "r4sleeprl")]
														
 
															     CHARLS_data["sleep_state_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
														
 
															+    # ADL
														
 
															+    waves = [(2011, "s1adlab_c"), (2013, "s2adlab_c"), (2015, "s3adlab_c"), (2018, "s4adlab_c")]
														
 
															+    CHARLS_data["ADL_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
														
 
															+
														
 
															+    #年龄
														
 
															+    waves = [(2011, "r1agey"), (2013, "r2agey"), (2015, "r3agey"), (2018, "r4agey")]
														
 
															+    CHARLS_data["age_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
														
 
															+
														
 
															     #计算认知功能得分，分成三部分：电话问卷9分，词语回忆10分、画图1分
														
 
															     waves = [(2011, "r1orient"), (2013, "r2orient"), (2015, "r3orient"), (2018, "r4orient")]
														
 
															     CHARLS_data["Date_Naming"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
														
@@ -162,7 +209,10 @@ if __name__ == "__main__":
 
															                   	"Physical_activity_m",	"Smoke_m",	'Drink_m',	'Hypertension_m',	'Disabetes_or_High_Blood_Sugar_m',
														
 
															                     'Cancer_or_Malignant_Tumor_m',	'Chronic_Lung_Diseases_m',	'Heart_Problems_m',	'Emotional_Nervous_or_Psychiatric_Problems_m',
														
 
															                 	'Stroke_m',	'Arthritis_or_Rheumatism_m',	'Dyslipidemia_m',	'Liver_Disease_m',	'Kidney_Diease_m',	'Stomach_or_Other_Digestive_Disease_m',
														
 
															-                	'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m']
														
 
															+                	'Asthma_m',	'Memory_Related_Disease_m',	'Psychiatric_score_m',	'sleep_state_m', 'Cognition_score_m', "age_m", "ADL_m"]
														
 
															+    
														
 
															+    #先处理身高1~2单位为米，大于3为cm
														
 
															+    CHARLS_data['Height'] = CHARLS_data['Height'].apply(lambda x: x if 1 <= x <= 2 else (x / 100 if x > 3 else x))
														
 
															     # 遍历 merge_list 列表
														
 
															     for col_m in merge_list:
														
@@ -170,6 +220,10 @@ if __name__ == "__main__":
 
															         if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
														
 
															             CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
														
 
															+    # 计算BMI
														
 
															+    CHARLS_data['BMI'] = CHARLS_data['Weight'] /(CHARLS_data['Height'] * CHARLS_data['Height'])
														
 
															+    CHARLS_data['BMI'] = CHARLS_data["BMI_m"].fillna(CHARLS_data['BMI'])
														
 
															+
														
 
															     # 处理慢性病标准不一样，将2变为0
														
 
															     chronic_disease = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
														
 
															                   'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
														
@@ -183,5 +237,20 @@ if __name__ == "__main__":
 
															         if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
														
 
															             CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
														
 
															-    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture"] + merge_list+ common_new_list)
														
 
															+    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture", "BMI_m"] + merge_list+ common_new_list)
														
 
															+    
														
 
															+    #处理性别
														
 
															+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "rgender")).reset_index(drop=True)
														
 
															+
														
 
															+    #处理出生年月
														
 
															+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_year")).reset_index(drop=True)
														
 
															+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "birth_month")).reset_index(drop=True)
														
 
															+
														
 
															+    #处理教育
														
 
															+    CHARLS_data = CHARLS_data.groupby('ID').apply(lambda group: fill_gender(group, "education")).reset_index(drop=True)
														
 
															+    
														
 
															+    #重新计算年龄
														
 
															+    CHARLS_data["age"] = CHARLS_data.apply(calculate_age, axis=1)
														
 
															+
														
 
															+    
														
 
															     CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv", index=False)
														
--- a/CHARLS_P/CHARLS_preprocess_main.py
+++ b/CHARLS_P/CHARLS_preprocess_main.py
@@ -1,10 +1,12 @@
 
															 import pandas as pd
														
 
															 import numpy as np
														
 
															 import pyreadstat
														
 
															+from datetime import date
														
 
															+from lunarcalendar import Converter, Lunar
														
 
															 #统一列名
														
 
															 def change_columns(df):
														
 
															-    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "marital_status" , "education", 'province', 'city',"Height", "Weight",
														
 
															+    df.columns = ["ID",'householdID','communityID','rgender', "birth_year", "birth_month", "ba003", "iyear", "imonth", "marital_status" , "education", 'province', 'city',"Height", "Weight",
														
 
															                   "waist", "Systolic","Diastolic",
														
 
															                   'bl_wbc','bl_mcv','bl_plt','bl_bun','bl_glu','bl_crea','bl_cho', 'bl_tg', 'bl_hdl', 'bl_ldl','bl_crp', 
														
@@ -18,7 +20,7 @@ def change_columns(df):
 
															                   'Smoke','Drink',
														
 
															-                  "Cognition_score", "Psychiatric_score","sleep_state", "wave"
														
 
															+                  "Cognition_score", "Psychiatric_score","sleep_state", "ADL", "wave",
														
 
															                   ]
														
 
															 # 2020年把帕金森和记忆病症分开，需要和以前对齐   
														
 
															 def process_row(row):
														
@@ -54,7 +56,7 @@ if __name__ == "__main__":
 
															     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_status_and_functioning.dta")
														
 
															     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/health_care_and_insurance.dta")
														
 
															     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
														
 
															-
														
 
															+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/weight.dta")
														
 
															     #性别#年龄#居住地#婚姻状况
														
 
															     # 1 married or partnered
														
 
															     # 0 other marital status (separated, divorced, unmarried, or widowed)
														
@@ -65,8 +67,11 @@ if __name__ == "__main__":
 
															     # 1 high school
														
 
															     # 2 college or above
														
 
															     demo["education"] = demo["bd001"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10, 11] else 0 if x in [1,2,3,4,5] else np.nan)
														
 
															+    
														
 
															+    #获取随访时间
														
 
															+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
														
 
															-    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1','marital_status', 'education']]
														
 
															+    data_2011 = demo[['ID','householdID', 'communityID','rgender','ba002_1', 'ba002_2','ba003',"iyear", "imonth" ,'marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2011 = pd.merge(data_2011, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
@@ -205,6 +210,16 @@ if __name__ == "__main__":
 
															     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															     data_2011["sleep_state"] = health_status['dc015']
														
 
															+    #ADL
														
 
															+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    data_2011["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
														
 
															+                        health_status["db014_score"] + health_status["db015_score"]
														
 
															+
														
 
															     data_2011["wave"] = year
														
 
															     change_columns(data_2011)
														
 
															     # 2011年的ID和其他年份有一点区别，倒数第三位加0
														
@@ -218,6 +233,7 @@ if __name__ == "__main__":
 
															     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
														
 
															     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
														
 
															     exp_income, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/exp_income_wealth.dta")
														
 
															+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Weights.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															     # 1 married or partnered
														
@@ -252,21 +268,27 @@ if __name__ == "__main__":
 
															     # 纠正2011年统计错误的出生年
														
 
															     demo["birth_year"] = demo.apply(lambda x : x["ba002_1"] if not pd.isna(x["ba002_1"]) else np.nan, axis=1)
														
 
															-    birth_year_2013 = demo[['ID',"birth_year"]]
														
 
															+    demo["birth_month"] = demo.apply(lambda x : x["ba002_2"] if not pd.isna(x["ba002_2"]) else np.nan, axis=1)
														
 
															+    birth_year_2013 = demo[['ID',"birth_year", "birth_month"]]
														
 
															     # 按 'ID' 列合并两个表
														
 
															     data_2011 = pd.merge(data_2011, birth_year_2013, on='ID', how='left', suffixes=("_2011","_2013"))
														
 
															     # 使用 fillna() 来更新字段
														
 
															     data_2011['birth_year'] = data_2011['birth_year_2013'].fillna(data_2011['birth_year_2011'])
														
 
															+    data_2011['birth_month'] = data_2011['birth_month_2013'].fillna(data_2011['birth_month_2011'])
														
 
															     # 删除多余的列
														
 
															-    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011'])
														
 
															+    data_2011 = data_2011.drop(columns=['birth_year_2013', 'birth_year_2011', 'birth_month_2013', 'birth_month_2011'])
														
 
															     #合并2011年的出生年
														
 
															-    birth_year_2011 = data_2011[['ID',"birth_year"]]
														
 
															+    birth_year_2011 = data_2011[['ID',"birth_year", "birth_month"]]
														
 
															     # 按 'ID' 列合并两个表
														
 
															     demo = pd.merge(demo, birth_year_2011, on='ID', how='left', suffixes=("_2013","_2011"))
														
 
															     # 使用 fillna() 来更新字段
														
 
															     demo['birth_year'] = demo['birth_year_2013'].fillna(demo['birth_year_2011'])
														
 
															+    demo['birth_month'] = demo['birth_month_2013'].fillna(demo['birth_month_2011'])
														
 
															+
														
 
															+    #获取随访时间
														
 
															+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
														
 
															-    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','marital_status', "education"]]
														
 
															+    data_2013 = demo[['ID','householdID', 'communityID','ba000_w2_3','birth_year','birth_month','ba003',"iyear", "imonth", 'marital_status', "education"]]
														
 
															     #居住地
														
 
															     data_2013 = pd.merge(data_2013, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
@@ -425,6 +447,16 @@ if __name__ == "__main__":
 
															     # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															     data_2013["sleep_state"] = health_status['dc015']
														
 
															+
														
 
															+    #ADL
														
 
															+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    data_2013["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
														
 
															+                        health_status["db014_score"] + health_status["db015_score"]
														
 
															     data_2013["wave"] = year
														
 
															     change_columns(data_2013)
														
@@ -438,6 +470,7 @@ if __name__ == "__main__":
 
															     biomarkers, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Biomarker.dta")
														
 
															     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
														
 
															     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
														
 
															+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															     # 1 married or partnered
														
@@ -460,8 +493,12 @@ if __name__ == "__main__":
 
															     # 2015年的出生年
														
 
															     demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba002"]==1 else x["ba002_1"] if x["ba002"]==2 else np.nan, axis=1)
														
 
															+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba002"]==1 else x["ba002_2"] if x["ba002"]==2 else np.nan, axis=1)
														
 
															+    
														
 
															+    #获取随访时间
														
 
															+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
														
 
															-    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year', 'marital_status', 'education']]
														
 
															+    data_2015 = demo[['ID','householdID', 'communityID','ba000_w2_3', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2015 = pd.merge(data_2015, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
@@ -620,6 +657,16 @@ if __name__ == "__main__":
 
															     # (3)Occasionally or a moderate amount of the time (3-4 days)　有时或者说有一半的时间（3-4天） 
														
 
															     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															     data_2015["sleep_state"] = health_status['dc015']
														
 
															+
														
 
															+    #ADL
														
 
															+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    data_2015["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
														
 
															+                        health_status["db014_score"] + health_status["db015_score"]
														
 
															     data_2015["wave"] = year
														
 
															     change_columns(data_2015)
														
@@ -632,6 +679,7 @@ if __name__ == "__main__":
 
															     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
														
 
															     health_care, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Care_and_Insurance.dta")
														
 
															     cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
														
 
															+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															     # 1 married or partnered
														
@@ -648,8 +696,12 @@ if __name__ == "__main__":
 
															     # 出生年
														
 
															     demo["birth_year"] = demo.apply(lambda x : x["ba004_w3_1"] if x["ba005_w4"]==1 else x["ba002_1"] if x["ba005_w4"]==2 else np.nan, axis=1)
														
 
															+    demo["birth_month"] = demo.apply(lambda x : x["ba004_w3_2"] if x["ba005_w4"]==1 else x["ba002_2"] if x["ba005_w4"]==2 else np.nan, axis=1)
														
 
															-    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
														
 
															+    #获取随访时间
														
 
															+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
														
 
															+
														
 
															+    data_2018 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2018 = pd.merge(data_2018, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
@@ -794,6 +846,16 @@ if __name__ == "__main__":
 
															     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															     data_2018["sleep_state"] = cognition['dc015'].apply(lambda x : np.nan if x > 4 else x) 
														
 
															+    #ADL
														
 
															+    health_status["db010_score"] = health_status["db010"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db011_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db012_score"] = health_status["db012"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db013_score"] = health_status["db013"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db014_score"] = health_status["db014"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db015_score"] = health_status["db015"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    data_2018["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
														
 
															+                        health_status["db014_score"] + health_status["db015_score"]
														
 
															+    
														
 
															     data_2018["wave"] = year
														
 
															     change_columns(data_2018)
														
 
															     data_2018 = pd.concat([data_2015, data_2018], axis=0)
														
@@ -803,6 +865,7 @@ if __name__ == "__main__":
 
															     demo, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Demographic_Background.dta")
														
 
															     psu, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/PSU.dta", encoding='gbk')
														
 
															     health_status, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Health_Status_and_Functioning.dta")
														
 
															+    weight, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Sample_Infor.dta")
														
 
															     #性别#年龄#婚姻状况
														
 
															     # 1 married or partnered
														
@@ -824,14 +887,21 @@ if __name__ == "__main__":
 
															     # 出生年
														
 
															     demo["birth_year"] = demo.apply(lambda x : x["ba003_1"] if pd.isna(x["ba003_1"]) else np.nan, axis=1)
														
 
															+    demo["birth_month"] = demo.apply(lambda x : x["ba003_2"] if pd.isna(x["ba003_2"]) else np.nan, axis=1)
														
 
															     #合并2018年的出生年
														
 
															-    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year"]]
														
 
															+    birth_year_2018 = data_2018[data_2018["wave"]=="2018"][['ID',"birth_year", "birth_month"]]
														
 
															     # 按 'ID' 列合并两个表
														
 
															     demo = pd.merge(demo, birth_year_2018, on='ID', how='left', suffixes=("_2020","_2018"))
														
 
															     # 使用 fillna() 来更新字段
														
 
															     demo['birth_year'] = demo['birth_year_2020'].fillna(demo['birth_year_2018'])
														
 
															+    demo['birth_month'] = demo['birth_month_2020'].fillna(demo['birth_month_2018'])
														
 
															+
														
 
															+    #获取随访时间
														
 
															+    demo = pd.merge(demo, weight[["ID", "iyear", "imonth"]], on = "ID", how="left")
														
 
															-    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year', 'marital_status', 'education']]
														
 
															+    demo["ba003"] = 1
														
 
															+
														
 
															+    data_2020 = demo[['ID','householdID', 'communityID','xrgender', 'birth_year','birth_month','ba003',"iyear",  "imonth", 'marital_status', 'education']]
														
 
															     #居住地
														
 
															     data_2020 = pd.merge(data_2020, psu[['communityID', 'province', 'city']], on = "communityID", how="left")
														
@@ -976,6 +1046,16 @@ if __name__ == "__main__":
 
															     # (4)Most or all of the time (5-7 days) 大多数的时间（5-7天） 
														
 
															     data_2020["sleep_state"] = health_status['dc022'].apply(lambda x : np.nan if x >900 else x) 
														
 
															+    #ADL
														
 
															+    health_status["db010_score"] = health_status["db001"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db011_score"] = health_status["db003"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db012_score"] = health_status["db005"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db013_score"] = health_status["db007"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db014_score"] = health_status["db009"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    health_status["db015_score"] = health_status["db011"].apply(lambda x : 0 if x==1 else 1 if x >= 2 else np.nan)
														
 
															+    data_2020["ADL"] = health_status["db010_score"] + health_status["db011_score"] + health_status["db012_score"] + health_status["db013_score"] + \
														
 
															+                        health_status["db014_score"] + health_status["db015_score"]
														
 
															+    
														
 
															     data_2020["wave"] = year
														
 
															     change_columns(data_2020)
														
 
															     data_2020 = pd.concat([data_2018, data_2020], axis=0)
														
@@ -998,5 +1078,6 @@ if __name__ == "__main__":
 
															     data_2020['city'] = data_2020['city'].replace('巢湖市', '合肥市')
														
 
															     #襄樊市->襄阳市
														
 
															     data_2020['city'] = data_2020['city'].replace('襄樊市', '襄阳市') 
														
 
															+
														
 
															     data_2020.to_csv("/root/r_base/CHARLS/result_all_new.csv", index=False)
														
 
															     print(123)
														
--- a/paper_code/code.R
+++ b/paper_code/code.R
@@ -3,58 +3,63 @@
 
															 library(msm)
														
 
															 library(survival)
														
 
															-# data <- data.frame(
														
 
															-#   ID = c(1, 1, 1, 2, 2, 2),        # 人员ID
														
 
															-#   time = c(0, 1, 2, 0, 1, 2),      # 随访时间
														
 
															-#   state = c(1, 2, 3, 1, 1, 2),     # 疾病状态
														
 
															-#   birth_year = c(1970, 1970, 1970, 1980, 1980, 1980),
														
 
															-#   gender = c(1, 1, 1, 2, 2, 2),    # 性别
														
 
															-#   education = c(3, 3, 3, 2, 2, 2)  # 教育程度
														
 
															-# )
														
 
															-# statetable.msm(state, ID, data = data)
														
 
															-
														
 
															-# qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
														
 
															-#                          0.1, -0.3, 0.2,
														
 
															-#                          0, 0, 0), 
														
 
															-#                        nrow = 3, byrow = TRUE)
														
 
															-
														
 
															-# msm_model <- msm(state ~ time, subject = ID, data = data,
														
 
															-#                  qmatrix = qmatrix_init, 
														
 
															-#                  covariates = ~ gender + education)
														
 
															-# pmatrix.msm(msm_model, t = 1)  # t = 1 代表随访之间的间隔时间
														
 
															-# summary(msm_model)
														
 
															-
														
 
															-# 创建数据框
														
 
															-data <- data.frame(
														
 
															-  ID = c(1, 1, 1, 2, 2, 2),        # 人员ID
														
 
															-  time = c(0, 1, 2, 0, 1, 2),      # 随访时间
														
 
															-  state = c(1, 2, 3, 1, 1, 2),     # 疾病状态
														
 
															-  birth_year = c(1970, 1970, 1970, 1980, 1980, 1980), # 出生年份
														
 
															-  gender = c(1, 1, 1, 2, 2, 2),    # 性别
														
 
															-  education = c(3, 3, 3, 2, 2, 2)  # 教育程度
														
 
															-)
														
 
															+# View(cav)
														
 
															+# Load the long-format panel written by data_preprocess.py.
														
 
															+data <- read.csv("paper_data.csv")
														
 
															+
														
 
															+# Age bands as left-closed intervals: [45, 55), [55, 65), [65, Inf).
														
 
															+# This has to run before the summary()/View() calls below, which read data$age_group.
														
 
															+data$age_group <- cut(data$age,
														
 
															+                      breaks = c(45, 55, 65, Inf),
														
 
															+                      labels = c("45-54", "55-64",">=65"),
														
 
															+                      right = FALSE)
														
 
															+
														
 
															+# Recode rgender (1, 2) as a labelled factor for use as a model covariate.
														
 
															+data$gender_ <- factor(data$rgender, levels = c(1, 2), labels = c("男", "女"))
														
 
															+
														
 
															+summary(data$age_group)
														
 
															+# View(data[, c("age", "age_group")])
														
 
															+View(data$age_group)
														
 
															 # Frequency table of observed state-to-state transitions
														
 
															 freq_table <- statetable.msm(state, ID, data = data)
														
 
															 print(freq_table)
														
 
															 # Initial transition intensity matrix
														
 
															-qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
														
 
															-                          0.1, -0.3, 0.2,
														
 
															-                          0.3, 0.2, -0.5), 
														
 
															-                        nrow = 3, byrow = TRUE)
														
 
															+# 4 x 4 matrix filled row by row; the all-zero fourth row means no transition leaves state 4.
														
 
															+qmatrix_init <- matrix(c(-0.5, 0.25, 0.15, 0.1,
														
 
															+                          0.1, -0.3, 0.1, 0.1,
														
 
															+                          0.3, 0.1, -0.5, 0.1,
														
 
															+                          0,   0,    0,   0), 
														
 
															+                        nrow = 4, byrow = TRUE)
														
 
															 # Build the initial values for the model
														
 
															-crude_init <- crudeinits.msm(state ~ time+ gender, subject = ID, data = data, qmatrix = qmatrix_init)
														
 
															+# NOTE(review): crudeinits.msm presumably derives crude starting intensities from the data
														
 
															+# for the transitions allowed in qmatrix_init; confirm against the msm manual.
														
 
															+crude_init <- crudeinits.msm(state ~ wave, subject = ID, data = data, qmatrix = qmatrix_init)
														
 
															+
														
 
															+View(crude_init)
														
 
															 # Fit the multi-state model
														
 
															-msm_model <- msm(state ~ time, subject = ID, data = data,
														
 
															+# Model `state` over `wave` per subject ID, starting from crude_init, with gender_ as the
														
 
															+# only covariate and state 4 passed as the `death` state.
														
 
															+msm_model <- msm(state ~ wave, subject = ID, data = data,
														
 
															                  qmatrix = crude_init,
														
 
															-                 covariates = ~ gender+education)
														
 
															+                 covariates = ~ gender_, 
														
 
															+                 death = 4,
														
 
															+                 method = "BFGS", control = list(fnscale = 4000, maxit = 10000)
														
 
															+                 )
														
 
															 # Compute the state transition probability matrix
														
 
															-prob_matrix <- pmatrix.msm(msm_model, t = 1)  # t = 1 代表随访之间的间隔时间
														
 
															+prob_matrix <- pmatrix.msm(msm_model, t = 5)  # t is the time horizon, in the units of the model's time variable (wave)
														
 
															 print(prob_matrix)
														
 
															+# Print the intensity (rate) matrix of the fitted model
														
 
															+q_matrix <- qmatrix.msm(msm_model)
														
 
															+print(q_matrix)
														
 
															+
														
 
															+# Extract the transition intensities
														
 
															+# NOTE(review): a fitted msm object may not have a `qmatrix` component, in which case this
														
 
															+# prints NULL; confirm against the msm.object documentation.
														
 
															+transition_intensity <- msm_model$qmatrix
														
 
															+print(transition_intensity)
														
 
															+
														
 
															+# Mean sojourn time in each state
														
 
															+so_journ <- sojourn.msm(msm_model)
														
 
															+print(so_journ)
														
 
															+
														
 
															+# Equilibrium state probabilities (nothing is computed here yet)
														
 
															+
														
 
															 # Show the detailed model results
														
 
															 summary(msm_model)
														
 
															+
														
 
															+# Clear every object from the workspace once the script has finished.
														
 
															+rm(list = ls())
														
--- a/paper_code/data_preprocess.py
+++ b/paper_code/data_preprocess.py
@@ -0,0 +1,178 @@
 
															+import pandas as pd
														
 
															+import numpy as np
														
 
															+import pyreadstat
														
 
															+from sklearn.experimental import enable_iterative_imputer
														
 
															+from sklearn.impute import IterativeImputer
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    # Load the CHARLS panel from CSV.
														
 
															+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
														
 
															+
														
 
															+    # Exposure columns that are carried through unchanged to the output.
														
 
															+    covariates = ["last_year_NO2", "before_last_NO2", "last_year_O3", "before_last_O3",
														
 
															+                  "last_year_pm1", "before_last_pm1", "last_year_pm2.5", "before_last_pm2.5", "last_year_pm10",
														
 
															+                  'before_last_pm10', 'last_year_SO4', 'last_year_NO3', 'last_year_NH4', 'last_year_OM', 'last_year_BC', 'before_last_SO4',
														
 
															+                  'before_last_NO3', 'before_last_NH4', 'before_last_OM', 'before_last_BC', 'last_year_nl', 'before_last_nl']
														
 
															+    # Chronic-disease indicator columns (assumed numeric 0/1 - TODO confirm). Listed once so
														
 
															+    # that selecting, counting and dropping them cannot drift apart.
														
 
															+    disease_columns = ['Hypertension', 'Dyslipidemia', 'Disabetes_or_High_Blood_Sugar', 'Cancer_or_Malignant_Tumor', 'Chronic_Lung_Diseases',
														
 
															+                       'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease', 'Stomach_or_Other_Digestive_Disease',
														
 
															+                       'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease', 'Arthritis_or_Rheumatism', 'Asthma']
														
 
															+    base_columns = ["ID", "rgender", "age", "marital_status", "education", "Physical_activity", "Psychiatric_score", "BMI", "ADL", "Smoke", "Drink"]
														
 
															+
														
 
															+    # Select the needed fields. The explicit copy makes `data` an independent frame, so adding
														
 
															+    # the "state" column below does not raise SettingWithCopyWarning.
														
 
															+    data = CHARLS_data[base_columns + disease_columns + ["wave"] + covariates].copy()
														
 
															+
														
 
															+    # Multimorbidity state from the number of reported conditions. skipna=False keeps the
														
 
															+    # count missing whenever any indicator is missing, as adding the columns one by one would.
														
 
															+    disease_count = data[disease_columns].sum(axis=1, skipna=False)
														
 
															+    # 1 = no condition, 2 = exactly one, 3 = two or more; any other count (including missing) -> NaN.
														
 
															+    data["state"] = np.select([disease_count == 0, disease_count == 1, disease_count >= 2], [1, 2, 3], default=np.nan)
														
 
															+
														
 
															+    # The individual indicators are no longer needed once the state is derived.
														
 
															+    data = data.drop(columns=disease_columns)
														
 
															+    
														
 
															+    # Attach an exit year to every record of a respondent, taken from the exit interviews.
														
 
															+    # Respondents without an exit interview keep NaN.
														
 
															+    # NOTE(review): exb001_1 is presumably the year of death; verify against the codebook.
														
 
															+    # NOTE(review): if an exit file lists an ID more than once, the left merges below
														
 
															+    # duplicate that respondent's rows; confirm IDs are unique in both files.
														
 
															+
														
 
															+    # 2013 exit interview. The frame is deliberately not named `exit`, which would shadow
														
 
															+    # the builtin; the metadata returned by pyreadstat is not used.
														
 
															+    exit_2013, _ = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/Exit_Interview.dta")
														
 
															+    exit_2013['ID'] = pd.to_numeric(exit_2013['ID'], errors='coerce').astype('Int64')
														
 
															+    exit_2013["exit_year"] = exit_2013["exb001_1"]
														
 
															+    data = pd.merge(data, exit_2013[['ID', "exit_year"]], on="ID", how="left")
														
 
															+
														
 
															+    # 2020 exit module. Both sides now carry "exit_year", so this merge yields
														
 
															+    # exit_year_x (from 2013) and exit_year_y (from 2020).
														
 
															+    exit_2020, _ = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2020/Exit_Module.dta")
														
 
															+    exit_2020['ID'] = pd.to_numeric(exit_2020['ID'], errors='coerce').astype('Int64')
														
 
															+    exit_2020["exit_year"] = exit_2020["exb001_1"]
														
 
															+    data = pd.merge(data, exit_2020[['ID', "exit_year"]], on="ID", how="left")
														
 
															+    
														
 
															+    # Combine the two sources: prefer the 2013 value, fall back to the 2020 value.
														
 
															+    data["exit_year"] = data["exit_year_x"].fillna(data["exit_year_y"])
														
 
															+
														
 
															+    # Survey years of the follow-up waves.
														
 
															+    followup_years = [2011, 2013, 2015, 2018, 2020]
														
 
															+
														
 
															+    # Cast 'ID' and 'wave' to int so that sorting and comparisons behave as expected.
														
 
															+    data['ID'] = data['ID'].astype(int)
														
 
															+    data['wave'] = data['wave'].astype(int)
														
 
															+
														
 
															+    # Flag each respondent's latest observed wave.
														
 
															+    is_last_wave = data['wave'] == data.groupby('ID')['wave'].transform('max')
														
 
															+
														
 
															+    # Last follow-up record of every respondent that has an exit year. A boolean mask is used
														
 
															+    # instead of groupby().apply() so that 'ID' stays an ordinary column on every pandas
														
 
															+    # version (newer releases stop passing grouping columns to apply()).
														
 
															+    # Assumes exit_year is constant within an ID, which holds because it is merged on ID.
														
 
															+    last_followup = data[data['exit_year'].notna() & is_last_wave]
														
 
															+
														
 
															+    # Last wave later than the exit year: that last record itself becomes the death state (4).
														
 
															+    died_before_last_wave = last_followup['wave'] > last_followup['exit_year']
														
 
															+    dead_at_last_wave_ids = last_followup.loc[died_before_last_wave, 'ID']
														
 
															+    data.loc[data['ID'].isin(dead_at_last_wave_ids) & is_last_wave, 'state'] = 4
														
 
															+
														
 
															+    # Otherwise append a death record at the first survey year after the last observed wave.
														
 
															+    new_rows = last_followup[last_followup['wave'] <= last_followup['exit_year']].copy()
														
 
															+    new_rows['wave'] = new_rows['wave'].apply(lambda x: next((year for year in followup_years if year > x), None))
														
 
															+    # When the last observed wave is already the final survey year there is no later wave to
														
 
															+    # place the death in. Drop those rows instead of appending a record with a missing wave,
														
 
															+    # which the later imputation step would fill with an invented value.
														
 
															+    new_rows = new_rows.dropna(subset=['wave'])
														
 
															+    new_rows['wave'] = new_rows['wave'].astype(int)
														
 
															+    new_rows['state'] = 4
														
 
															+
														
 
															+    # Append the death records and restore (ID, wave) order.
														
 
															+    data = pd.concat([data, new_rows], ignore_index=True).sort_values(by=['ID', 'wave']).reset_index(drop=True)
														
 
															+
														
 
															+    # Drop the helper columns.
														
 
															+    data = data.drop(columns=["exit_year_x", "exit_year_y", "exit_year"])
														
 
															+
														
 
															+    # Report how many distinct respondents there are before the filter.
														
 
															+    print(f"删除空状态前的用户数：{data['ID'].nunique()}")
														
 
															+
														
 
															+    # Respondents with at least one record whose state is missing ...
														
 
															+    ids_missing_state = data.loc[data['state'].isna(), 'ID'].unique()
														
 
															+
														
 
															+    # ... lose all of their records, not only the incomplete ones.
														
 
															+    data = data[~data['ID'].isin(ids_missing_state)]
														
 
															+
														
 
															+    print(f"删除空状态后的用户数：{data['ID'].nunique()}")
														
 
															+
														
 
															+
														
 
															+
														
 
															+    # Keep the longitudinal records of respondents who took part in every wave, or who died.
														
 
															+    # Number of records (with a wave value) per respondent, aligned to the rows of `data`.
														
 
															+    waves_per_user = data.groupby('ID')['wave'].transform('count')
														
 
															+
														
 
															+    # State at each respondent's latest wave, looked up by ID and aligned to the rows of `data`.
														
 
															+    state_at_last_wave = data.loc[data.groupby('ID')['wave'].idxmax()].set_index('ID')['state']
														
 
															+    final_state = data['ID'].map(state_at_last_wave)
														
 
															+
														
 
															+    # Either all five waves are present, or the respondent ends in state 4 with more than one record.
														
 
															+    attended_all_waves = waves_per_user == 5
														
 
															+    died_during_followup = (final_state == 4) & (waves_per_user > 1)
														
 
															+    data = data[attended_all_waves | died_during_followup].reset_index(drop=True)
														
 
															+
														
 
															+    print(f"参加全部批次的用户数：{data['ID'].nunique()}")
														
 
															+
														
 
															+    
														
 
															+    # Respondent-level exclusions, applied in this order. Each rule flags offending rows, and
														
 
															+    # every record of a respondent with at least one flagged row is removed. The label is the
														
 
															+    # progress message printed with the number of respondents left after that step.
														
 
															+    exclusion_rules = [
														
 
															+        (lambda frame: frame['age'] < 45, "删除45岁后的用户数"),         # younger than 45 at any record
														
 
															+        (lambda frame: frame['age'].isna(), "删除空年龄后的用户数"),     # missing age
														
 
															+        (lambda frame: frame['education'].isna(), "删除空教育后的用户数"),  # missing education
														
 
															+        (lambda frame: frame["BMI"] >= 200, "删除异常BMI的用户数"),      # implausible BMI (>= 200)
														
 
															+    ]
														
 
															+    for find_offending_rows, label in exclusion_rules:
														
 
															+        excluded_ids = data.loc[find_offending_rows(data), 'ID'].unique()
														
 
															+        data = data[~data['ID'].isin(excluded_ids)]
														
 
															+        print(f"{label}：{data['ID'].nunique()}")
														
 
															+
														
 
															+
														
 
															+    # Fill the remaining missing values by iterative imputation.
														
 
															+    # NOTE(review): with these settings IterativeImputer appears to return one deterministic
														
 
															+    # completed data set rather than several imputations; confirm against the scikit-learn docs.
														
 
															+    # NOTE(review): every column of `data`, including ID, wave and state, is passed to the
														
 
															+    # imputer and comes back as float; confirm this is intended.
														
 
															+    completed_values = IterativeImputer(max_iter=10, random_state=0).fit_transform(data)
														
 
															+
														
 
															+    # Put the imputed array back into a DataFrame with the original column names.
														
 
															+    data = pd.DataFrame(completed_values, columns=data.columns)
														
 
															+
														
 
															+    # Imputed values are continuous, so round the discrete columns back to integers.
														
 
															+    categorical_columns = ['Physical_activity', 'Psychiatric_score', 'ADL', 'Smoke', 'Drink']
														
 
															+    for column in categorical_columns:
														
 
															+        data[column] = data[column].round().astype(int)
														
 
															+
														
 
															+    # Clamp any negative value, in any column, to 0.
														
 
															+    data = data.clip(lower=0)
														
 
															+
														
 
															+    # Write the result. No sort is done here; rows keep the order they already have.
														
 
															+    data.to_csv("paper_data.csv", index=False)