2 Revisions ac61a6b014 ... 3f9500c2be

Author SHA1 Message Date
  JazzZhao 3f9500c2be CHARLS database optimization 1 month ago
  JazzZhao 7af394ed45 Adjust the cognition score 1 month ago

+ 26 - 0
CHARLS_P/CHARLS_NDVI.py

@@ -0,0 +1,26 @@
+import pandas as pd
+
+if __name__ == "__main__":
+    years = [2011, 2013, 2015, 2018, 2020]
+
+    # Read the CHARLS data and copy it into the new output file
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
+    CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd.csv",index=False)
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
+    
+    # Read the NDVI data
+    ndvi_data = pd.read_excel("NDVI/【立方数据学社】地级市等级的逐年NDVI.xlsx")
+
+    for year in years:
+        # Add two columns: the environmental values one and two years before `year`
+        # CHARLS_data[['last_year_pm2.5', "before_last_pm2.5"]]=''
+        # Select the rows for this wave year
+        CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
+        # Merge the two tables
+        table_merge = pd.merge(CHARLS_data_year, ndvi_data, left_on="city", right_on="CITY", how='left')
+        # table_merge_last.to_csv("123.csv",index=False)
+        # Update the CHARLS table
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_ndvi'] = table_merge[str(year-1)].values
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_ndvi'] = table_merge[str(year-2)].values
+        print(year)
+    CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd.csv",index=False)
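
The per-wave update above writes the lagged NDVI back through a boolean mask with positional `.values`, which works because a pandas left merge preserves the left frame's row order. A minimal sketch of the same lag-merge pattern; the miniature frames and names (charls, ndvi) are illustrative only:

    import pandas as pd

    charls = pd.DataFrame({
        "ID": [1, 2, 3],
        "wave": [2011, 2011, 2013],
        "city": ["Beijing", "Chengdu", "Beijing"],
    })
    # One column per calendar year, keyed by CITY, as in the NDVI sheet.
    ndvi = pd.DataFrame({
        "CITY": ["Beijing", "Chengdu"],
        "2010": [0.31, 0.42], "2011": [0.33, 0.44], "2012": [0.35, 0.46],
    })

    for year in [2011, 2013]:
        rows = charls["wave"] == year
        merged = pd.merge(charls[rows], ndvi, left_on="city", right_on="CITY", how="left")
        # how='left' keeps the left table's row order, so the positional
        # .values assignment lines up with the boolean mask.
        charls.loc[rows, "last_year_ndvi"] = merged[str(year - 1)].values
        charls.loc[rows, "before_last_ndvi"] = merged[str(year - 2)].values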

+ 5 - 3
CHARLS_P/CHARLS_PM.py

@@ -13,9 +13,11 @@ def pollutant_handle(path):
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         # Merge the two tables
         table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
-        # Update the CHARLS table
-        CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_pm10'] = table_merge[str(year-1)].values
-        CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_pm10'] = table_merge[str(year-2)].values
+        if str(year - 1) in table_merge.columns:
+            # Update the CHARLS table
+            CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_pm10'] = table_merge[str(year-1)].values
+        if str(year - 2) in table_merge.columns:
+            CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_pm10'] = table_merge[str(year-2)].values
         CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
         print(year)
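
The guard added in this hunk skips a lagged assignment whenever the pollutant table carries no column for that calendar year (for instance, a table whose yearly columns stop at 2018 cannot supply a "2019" value for wave 2020). A small runnable sketch of the guard's effect, with a hypothetical pollutant table:

    import pandas as pd

    # Hypothetical pollutant table whose yearly columns stop at 2018.
    pollutants = pd.DataFrame({"province": ["p"], "city": ["c"],
                               "2017": [55.0], "2018": [60.0]})

    for year in (2018, 2020):
        for lag, col in ((1, "last_year_pm10"), (2, "before_last_pm10")):
            src = str(year - lag)
            if src in pollutants.columns:   # the guard added in this commit
                print(year, col, "<- column", src)
            else:
                print(year, col, "left as NaN: no", src, "column")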
 

+ 24 - 0
CHARLS_P/CHARLS_exit.py

@@ -0,0 +1,24 @@
+import pandas as pd
+import pyreadstat
+
+if __name__ == "__main__":
+    # Read the CHARLS data and copy it into the new output file
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_mete.csv")
+    CHARLS_data.to_csv("CHARLS_data_pollutants_exit.csv",index=False)
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_exit.csv")
+    
+    # Add a death-status column
+    # 0: alive
+    # 1: deceased
+    # Read the 2013 exit (death) interview
+    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2013/Exit_Interview.dta")
+    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
+    exit["exit_year"] = exit["exb001_1"]
+    CHARLS_data = pd.merge(CHARLS_data, exit[['ID', "exit_year"]], on = "ID", how="left")
+
+    # Read the 2020 exit (death) module
+    exit, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS2020/Exit_Module.dta")
+    exit['ID'] = pd.to_numeric(exit['ID'], errors='coerce').astype('Int64')
+    exit["exit_year"] = exit["exb001_1"]
+    CHARLS_data = pd.merge(CHARLS_data, exit[['ID', "exit_year"]], on = "ID", how="left")
+    CHARLS_data.to_csv("CHARLS_data_pollutants_exit.csv",index=False)
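
One caveat: merging a second column named exit_year onto a frame that already has one makes pandas suffix the pair as exit_year_x/exit_year_y, so the output ends up with two death-year columns rather than one. A hedged sketch of coalescing the two waves, assuming pandas' default merge suffixes (frame names are illustrative):

    import pandas as pd

    base = pd.DataFrame({"ID": [1, 2, 3]})
    exit_2013 = pd.DataFrame({"ID": [1], "exit_year": [2012]})
    exit_2020 = pd.DataFrame({"ID": [3], "exit_year": [2019]})

    merged = (base.merge(exit_2013, on="ID", how="left")
                  .merge(exit_2020, on="ID", how="left"))
    # The duplicate name arrives as exit_year_x / exit_year_y;
    # combine_first keeps the 2013 report and falls back to 2020.
    merged["exit_year"] = merged["exit_year_x"].combine_first(merged["exit_year_y"])
    merged = merged.drop(columns=["exit_year_x", "exit_year_y"])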

+ 187 - 0
CHARLS_P/CHARLS_harmonized.py

@@ -0,0 +1,187 @@
+import pandas as pd
+import numpy as np
+import pyreadstat
+
+
+# Helper that recodes an mstat column in harmonized: 1 married/partnered, 0 other, NaN otherwise
+def update_mstat(harmonized, col_name):
+    harmonized[col_name] = harmonized[col_name].apply(
+        lambda x: 1 if x in [1, 3] else 0 if x in [4, 5, 7, 8] else np.nan
+    )
+
+def update_physical(harmonized):
+    harmonized["r1phys"] = harmonized.apply(lambda x : 2 if x["r1vgact_c"]==1 else 
+                                                             1 if x["r1mdact_c"]==1 else 
+                                                             0 if x["r1ltact_c"] == 1 or (x["r1vgact_c"]==0 and x["r1mdact_c"]==0 and x["r1ltact_c"] == 0) 
+                                                             else np.nan ,axis=1)
+    harmonized["r2phys"] = harmonized.apply(lambda x : 2 if x["r2vgact_c"]==1 else 
+                                                             1 if x["r2mdact_c"]==1 else 
+                                                             0 if x["r2ltact_c"] == 1 or (x["r2vgact_c"]==0 and x["r2mdact_c"]==0 and x["r2ltact_c"] == 0) 
+                                                             else np.nan ,axis=1)
+    harmonized["r3phys"] = harmonized.apply(lambda x : 2 if x["r3vgact_c"]==1 else 
+                                                             1 if x["r3mdact_c"]==1 else 
+                                                             0 if x["r3ltact_c"] == 1 or (x["r3vgact_c"]==0 and x["r3mdact_c"]==0 and x["r3ltact_c"] == 0) 
+                                                             else np.nan ,axis=1)
+    harmonized["r4phys"] = harmonized.apply(lambda x : 2 if x["r4vgact_c"]==1 else 
+                                                             1 if x["r4mdact_c"]==1 else 
+                                                             0 if x["r4ltact_c"] == 1 or (x["r4vgact_c"]==0 and x["r4mdact_c"]==0 and x["r4ltact_c"] == 0) 
+                                                             else np.nan ,axis=1)
+def merge_data(harmonized, waves, flag="other"):
+    merged_data = []
+    # Iterate over the (wave, column) pairs and merge wave by wave
+    for wave, col_name in waves:
+        if flag=="mstat":
+            update_mstat(harmonized, col_name)
+        elif flag == "phys":
+            update_physical(harmonized)
+        # Merge this wave's rows with the harmonized column and keep the result
+        merged_data.append(pd.merge(
+            CHARLS_data[CHARLS_data["wave"] == wave],
+            harmonized[["ID", col_name]],
+            on="ID",
+            how="left"
+        )[col_name])
+    return merged_data
+
+if __name__ == "__main__":
+    harmonized, meta = pyreadstat.read_dta("/root/r_base/CHARLS/Harmonized_CHARLS/H_CHARLS_D_Data.dta")
+    CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd.csv")
+    harmonized['ID'] = harmonized['ID'].astype(str)  # cast IDs to string for the merge
+    CHARLS_data['ID'] = CHARLS_data['ID'].astype(str)  # cast IDs to string for the merge
+    # Marital status
+    # 1 married or partnered
+    # 0 other marital status (separated, divorced, unmarried, or widowed)
+    # Define the waves and their corresponding harmonized column names
+    waves = [(2011, "r1mstat"), (2013, "r2mstat"), (2015, "r3mstat"), (2018, "r4mstat")]
+    # Stack the four waves into one column and assign it to CHARLS_data["marital_status_m"]
+    CHARLS_data["marital_status_m"] = pd.concat(merge_data(harmonized, waves, "mstat"), ignore_index=True)
+
+    # Height
+    waves = [(2011, "r1mheight"), (2013, "r2mheight"), (2015, "r3mheight")]
+    CHARLS_data["Height_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Weight
+    waves = [(2011, "r1mweight"), (2013, "r2mweight"), (2015, "r3mweight")]
+    CHARLS_data["Weight_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Waist circumference
+    waves = [(2011, "r1mwaist"), (2013, "r2mwaist"), (2015, "r3mwaist")]
+    CHARLS_data["waist_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # BMI
+    waves = [(2011, "r1mbmi"), (2013, "r2mbmi"), (2015, "r3mbmi")]
+    CHARLS_data["BMI"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Systolic and diastolic blood pressure
+    waves = [(2011, "r1systo"), (2013, "r2systo"), (2015, "r3systo")]
+    CHARLS_data["Systolic_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1diasto"), (2013, "r2diasto"), (2015, "r3diasto")]
+    CHARLS_data["Diastolic_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Physical activity
+    # 2 vigorous (vigorous activity more than once a week)
+    # 1 moderate (moderate activity more than once a week)
+    # 0 inactive (the rest)
+    waves = [(2011, "r1phys"), (2013, "r2phys"), (2015, "r3phys"), (2018, "r4phys")]
+    CHARLS_data["Physical_activity_m"] = pd.concat(merge_data(harmonized, waves, "phys"), ignore_index=True)
+
+    # Smoking
+    # 1 ever smoked
+    # 0 never smoked
+    waves = [(2011, "r1smokev"), (2013, "r2smokev"), (2015, "r3smokev"), (2018, "r4smokev")]
+    CHARLS_data["Smoke_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Drinking
+    # 1 ever drank alcohol
+    # 0 never drank alcohol
+    waves = [(2011, "r1drinkev"), (2013, "r2drinkev"), (2015, "r3drinkev"), (2018, "r4drinkev")]
+    CHARLS_data["Drink_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Chronic diseases
+    waves = [(2011, "r1hibpe"), (2013, "r2hibpe"), (2015, "r3hibpe"), (2018, "r4hibpe")]
+    CHARLS_data["Hypertension_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1diabe"), (2013, "r2diabe"), (2015, "r3diabe"), (2018, "r4diabe")]
+    CHARLS_data["Disabetes_or_High_Blood_Sugar_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1cancre"), (2013, "r2cancre"), (2015, "r3cancre"), (2018, "r4cancre")]
+    CHARLS_data["Cancer_or_Malignant_Tumor_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1lunge"), (2013, "r2lunge"), (2015, "r3lunge"), (2018, "r4lunge")]
+    CHARLS_data["Chronic_Lung_Diseases_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1hearte"), (2013, "r2hearte"), (2015, "r3hearte"), (2018, "r4hearte")]
+    CHARLS_data["Heart_Problems_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1psyche"), (2013, "r2psyche"), (2015, "r3psyche"), (2018, "r4psyche")]
+    CHARLS_data["Emotional_Nervous_or_Psychiatric_Problems_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1stroke"), (2013, "r2stroke"), (2015, "r3stroke"), (2018, "r4stroke")]
+    CHARLS_data["Stroke_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1arthre"), (2013, "r2arthre"), (2015, "r3arthre"), (2018, "r4arthre")]
+    CHARLS_data["Arthritis_or_Rheumatism_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1dyslipe"), (2013, "r2dyslipe"), (2015, "r3dyslipe"), (2018, "r4dyslipe")]
+    CHARLS_data["Dyslipidemia_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1livere"), (2013, "r2livere"), (2015, "r3livere"), (2018, "r4livere")]
+    CHARLS_data["Liver_Disease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1kidneye"), (2013, "r2kidneye"), (2015, "r3kidneye"), (2018, "r4kidneye")]
+    CHARLS_data["Kidney_Diease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1digeste"), (2013, "r2digeste"), (2015, "r3digeste"), (2018, "r4digeste")]
+    CHARLS_data["Stomach_or_Other_Digestive_Disease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1asthmae"), (2013, "r2asthmae"), (2015, "r3asthmae"), (2018, "r4asthmae")]
+    CHARLS_data["Asthma_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1memrye"), (2013, "r2memrye"), (2015, "r3memrye"), (2018, "r4memrye")]
+    CHARLS_data["Memory_Related_Disease_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Psychiatric (CES-D 10) score
+    waves = [(2011, "s1cesd10"), (2013, "s2cesd10"), (2015, "s3cesd10"), (2018, "s4cesd10")]
+    CHARLS_data["Psychiatric_score_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Sleep state
+    waves = [(2011, "r1sleeprl"), (2013, "r2sleeprl"), (2015, "r3sleeprl"), (2018, "r4sleeprl")]
+    CHARLS_data["sleep_state_m"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    # Compute the cognition score in three parts: telephone questionnaire 9 points, word recall 10 points, picture drawing 1 point
+    waves = [(2011, "r1orient"), (2013, "r2orient"), (2015, "r3orient"), (2018, "r4orient")]
+    CHARLS_data["Date_Naming"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1imrc"), (2013, "r2imrc"), (2015, "r3imrc"), (2018, "r4imrc")]
+    CHARLS_data["Immediate_Word_Recall"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1dlrc"), (2013, "r2dlrc"), (2015, "r3dlrc"), (2018, "r4dlrc")]
+    CHARLS_data["Delayed_Word_Recall"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1ser7"), (2013, "r2ser7"), (2015, "r3ser7"), (2018, "r4ser7")]
+    CHARLS_data["Serial_7"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+    waves = [(2011, "r1draw"), (2013, "r2draw"), (2015, "r3draw"), (2018, "r4draw")]
+    CHARLS_data["Drawing_Picture"] = pd.concat(merge_data(harmonized, waves), ignore_index=True)
+
+    CHARLS_data["Cognition_score_m"] = CHARLS_data["Date_Naming"] + CHARLS_data["Immediate_Word_Recall"] + CHARLS_data["Delayed_Word_Recall"] + CHARLS_data["Serial_7"] + CHARLS_data["Drawing_Picture"]
+
+    # Merged once across all waves: gender, birth year, education
+    # Education
+    # 0 below high school
+    # 1 high school
+    # 2 college or above
+    harmonized["raeduc_c"] = harmonized["raeduc_c"].apply(lambda x : 1 if x == 6 or x == 7 else 2 if x in [8, 9, 10] else 0 if x in [1,2,3,4,5] else np.nan)
+    CHARLS_data = pd.merge(CHARLS_data, harmonized[["ID", "ragender", "rabyear", "raeduc_c"]], on='ID', how='left')
+
+    # Coalesce the harmonized (_m) columns with the wave-specific ones
+    merge_list = ["marital_status_m", "Height_m", "Weight_m", "waist_m", "Systolic_m", "Diastolic_m",
+                  "Physical_activity_m", "Smoke_m", "Drink_m", "Hypertension_m", "Disabetes_or_High_Blood_Sugar_m",
+                  "Cancer_or_Malignant_Tumor_m", "Chronic_Lung_Diseases_m", "Heart_Problems_m", "Emotional_Nervous_or_Psychiatric_Problems_m",
+                  "Stroke_m", "Arthritis_or_Rheumatism_m", "Dyslipidemia_m", "Liver_Disease_m", "Kidney_Diease_m", "Stomach_or_Other_Digestive_Disease_m",
+                  "Asthma_m", "Memory_Related_Disease_m", "Psychiatric_score_m", "sleep_state_m", "Cognition_score_m"]
+    
+    # Walk merge_list and fill each base column from its _m counterpart
+    for col_m in merge_list:
+        col = col_m.replace('_m', '')  # drop the '_m' suffix to get the target column name
+        if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
+            CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
+
+    # Chronic-disease coding differs between the sources: recode 2 (no) to 0
+    chronic_disease = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
+                  'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
+                  'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
+    CHARLS_data[chronic_disease] = CHARLS_data[chronic_disease].replace(2, 0)
+
+    # Map "ragender", "rabyear", "raeduc_c" onto the existing columns
+    common_new_list = ["ragender", "rabyear", "raeduc_c"]
+    common_list = ["rgender", "birth_year", "education"]
+    for col_m, col in zip(common_new_list, common_list):
+        if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
+            CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
+
+    CHARLS_data = CHARLS_data.drop(columns=["Date_Naming", "Immediate_Word_Recall", "Delayed_Word_Recall", "Serial_7", "Drawing_Picture"] + merge_list+ common_new_list)
+    CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv", index=False)
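
Note that merge_data returns one Series per wave and pd.concat(..., ignore_index=True) stacks them into a single column with a fresh 0..n-1 index, so the assignment to CHARLS_data is effectively positional: it lines up only if CHARLS_data's rows are grouped by wave in the same order as the waves list. A minimal sketch of that alignment assumption, with hypothetical frames (panel, harm):

    import pandas as pd

    # Hypothetical miniature panel: rows grouped by wave, in the same order
    # as the waves list below.
    panel = pd.DataFrame({"ID": ["a", "b", "a", "b"],
                          "wave": [2011, 2011, 2013, 2013]})
    harm = pd.DataFrame({"ID": ["a", "b"],
                         "r1mstat": [1.0, 0.0], "r2mstat": [0.0, 1.0]})

    parts = []
    for wave, col in [(2011, "r1mstat"), (2013, "r2mstat")]:
        parts.append(pd.merge(panel[panel["wave"] == wave],
                              harm[["ID", col]], on="ID", how="left")[col])

    # ignore_index=True re-indexes the stacked column 0..n-1, so this
    # assignment is positional; it is correct only because the panel's rows
    # are ordered by wave exactly as the parts were concatenated.
    panel["marital_status_m"] = pd.concat(parts, ignore_index=True)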

+ 14 - 14
CHARLS_P/CHARLS_preprocess_main.py

@@ -134,12 +134,12 @@ if __name__ == "__main__":
     data_2011 = pd.merge(data_2011, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
 
     
-    # Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 20 points, picture drawing 1 point
+    # Compute the cognition score in three parts: telephone questionnaire 9 points, word recall 20 points, picture drawing 1 point
     health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
     health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
     health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
     health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
-    health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
     health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
     health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
     health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
@@ -173,7 +173,7 @@ if __name__ == "__main__":
     health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
 
     data_2011["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
-        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
         health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
         health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
@@ -354,12 +354,12 @@ if __name__ == "__main__":
     
     data_2013 = pd.merge(data_2013, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
 
-    # Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 10 points, picture drawing 1 point
+    # Compute the cognition score in three parts: telephone questionnaire 9 points, word recall 10 points, picture drawing 1 point
     health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
     health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
     health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
     health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
-    health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
     health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
     health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
     health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
@@ -393,7 +393,7 @@ if __name__ == "__main__":
     health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
 
     data_2013["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
-        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
         health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
         health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
@@ -550,12 +550,12 @@ if __name__ == "__main__":
     
     data_2015 = pd.merge(data_2015, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
 
-    # Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 10 points, picture drawing 1 point
+    # Compute the cognition score in three parts: telephone questionnaire 9 points, word recall 10 points, picture drawing 1 point
     health_status["dc001s1_score"] = health_status["dc001s1"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
     health_status["dc001s2_score"] = health_status["dc001s2"].apply(lambda x : 1 if x==2 else 0 if pd.isna(x) else 0)
     health_status["dc001s3_score"] = health_status["dc001s3"].apply(lambda x : 1 if x==3 else 0 if pd.isna(x) else 0)
     health_status["dc002_score"] = health_status["dc002"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
-    health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
+    # health_status["dc003_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if pd.isna(x) else 0)
     health_status["dc019_score"] = health_status["dc019"].apply(lambda x : 1 if x==93 else 0 if pd.isna(x) else 0) 
     health_status["dc020_score"] = health_status["dc020"].apply(lambda x : 1 if x==86 else 0 if pd.isna(x) else 0) 
     health_status["dc021_score"] = health_status["dc021"].apply(lambda x : 1 if x==79 else 0 if pd.isna(x) else 0)
@@ -589,7 +589,7 @@ if __name__ == "__main__":
     health_status["draw_score"] = health_status["dc025"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
 
     data_2015["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
-        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
         health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
         health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \
@@ -723,12 +723,12 @@ if __name__ == "__main__":
 
     data_2018 = pd.merge(data_2018, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
 
-    # Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 10 points, picture drawing 1 point
+    # Compute the cognition score in three parts: telephone questionnaire 9 points, word recall 10 points, picture drawing 1 point
     cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
     cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
     cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
     cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-    cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+    # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
     cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) 
     cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) 
     cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
@@ -762,7 +762,7 @@ if __name__ == "__main__":
     cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
 
     data_2018["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
-        cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
+        cognition["dc001s3_score"] + cognition["dc002_score"]+ \
         cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
         cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
         cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
@@ -906,7 +906,7 @@ if __name__ == "__main__":
     
     data_2020 = pd.merge(data_2020, health_status_select, on = ["ID", 'householdID', 'communityID'], how="left")
 
-    # Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 10 points, picture drawing 1 point
+    # Compute the cognition score in three parts: telephone questionnaire 9 points, word recall 10 points, picture drawing 1 point
     health_status["dc001s1_score"] = health_status["dc001"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
     health_status["dc001s2_score"] = health_status["dc005"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
     health_status["dc001s3_score"] = health_status["dc003"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
@@ -943,7 +943,7 @@ if __name__ == "__main__":
     health_status["draw_score"] = health_status["dc009"].apply(lambda x : 1 if x==1 else 0 if x==2 else np.nan)
 
     data_2020["Cognition_score"] = health_status["dc001s1_score"] + health_status["dc001s2_score"] + \
-        health_status["dc001s3_score"] + health_status["dc002_score"]+ health_status["dc003_score"]+ \
+        health_status["dc001s3_score"] + health_status["dc002_score"]+ \
         health_status["dc019_score"]+ health_status["dc020_score"] + health_status["dc021_score"]+ \
         health_status["dc022_score"]+ health_status["dc023_score"] + health_status["dc006s1_score"] + \
         health_status["dc006s2_score"] + health_status["dc006s3_score"] + health_status["dc006s4_score"] + \

+ 9 - 2
CHARLS_P/CHARLS_split.py

@@ -20,6 +20,11 @@ if __name__ == "__main__":
     disease_features = ['Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
                   'Liver_Disease', 'Heart_Problems', 'Stroke', 'Kidney_Diease','Stomach_or_Other_Digestive_Disease', 
                   'Emotional_Nervous_or_Psychiatric_Problems', 'Memory_Related_Disease','Arthritis_or_Rheumatism','Asthma']
+    
+    # Interaction of night-time light exposure and air pollution on chronic non-communicable diseases (hypertension, heart disease, diabetes, obesity, stroke, arthritis, cancer, and memory-related diseases)
+    one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
+    one_data.to_csv("1.csv", index=False)
+    
     # Interaction of night-time light exposure and air pollution on cognitive function in older adults, with inflammation and oxidative stress as mediators
     one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features+["Cognition_score", "Memory_Related_Disease"]]
     one_data.to_csv("2.csv", index=False)
@@ -65,10 +70,12 @@ if __name__ == "__main__":
     one_data.to_csv("12.csv", index=False)   
 
     # 13. Interaction of night-time light exposure and air pollution on sleep disorders, with inflammation and oxidative stress as mediators
-    
+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + blood_features + ["sleep_state"]]
+    one_data.to_csv("13.csv", index=False)
 
     # 14. Bayesian-network prediction of disease risk from night-time light exposure, air pollution, and obesity-related indicators (BMI, conicity index, relative fat mass index, etc.)
-
+    one_data = data.loc[(data['wave'] == 2011) | (data['wave'] == 2013) | (data['wave'] == 2015), base_feature + pollutant_feature + nl_feature + ["Height", "Weight", "waist"]]
+    one_data.to_csv("14.csv", index=False)
 
     # 15. Association between trajectories of night-time light and air-pollution exposure (latent growth models, etc.) and disease
     one_data = data.loc[:, base_feature + pollutant_feature + nl_feature + disease_features]
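
The wave filters in this file chain equality tests with `|`; `Series.isin` expresses the same subset more compactly and is easier to extend when another wave joins an extract. A tiny equivalent sketch on a hypothetical frame:

    import pandas as pd

    data = pd.DataFrame({"wave": [2011, 2013, 2015, 2018], "x": range(4)})
    # Same rows as (data['wave'] == 2011) | (data['wave'] == 2013) | (data['wave'] == 2015)
    subset = data.loc[data["wave"].isin([2011, 2013, 2015])]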

+ 3 - 2
CHARLS_P/chongqing_pm.py

@@ -3,7 +3,7 @@ from glob import glob
 import os
 
 def pollutant_chongqing_handle():
-    path = "result_O3"
+    path = "pollution/result_SO2"
     data = pd.read_csv(path+".csv")
     # Find the rows where the province column equals '重庆市' (Chongqing)
     chongqing_rows = data[data['province'] == '重庆市']
@@ -35,4 +35,5 @@ def aba_chongqing_handle():
         df.to_csv(path+file_name+"_p"+extension, index=False)
 
 if __name__ == "__main__":
-    aba_chongqing_handle()
+    pollutant_chongqing_handle()
+    # aba_chongqing_handle()

+ 2 - 2
CHARLS_P/nc2geotiff.py

@@ -14,12 +14,12 @@ import concurrent.futures
 # from osgeo import osr
 
 #Define work and output paths
-WorkPath = r'/root/r_base/O3'
+WorkPath = r'/root/r_base/pollution/SO2'
 OutPath  = WorkPath
 
 #Define air pollutant type 
 #e.g., PM1, PM2.5, PM10, O3, NO2, SO2, and CO, et al.
-AP = 'O3'
+AP = 'SO2'
 
 #Define spatial resolution 
 #e.g., 1 km ≈ 0.01 Degree

+ 4 - 1
README.md

@@ -7,4 +7,7 @@ AreaCity-Query-Geometry is the program used to process the coordinate data
 
 1. What is multimorbidity, and how is it defined? Does having two or more diseases at the same time count as multimorbidity?
 2. The outcome variable is categorical and states can convert into one another; how should the multimorbidity states be defined?
-3. Define the states: healthy state: people with no comorbid condition. Single-disease state: people with one specific disease; this can be split into categories such as mental illness (e.g., anxiety, depression) or physical illness (e.g., hypertension, diabetes). Multimorbidity state: two or more diseases present at the same time, e.g., anxiety together with insomnia, or diabetes with cardiovascular disease. Deterioration state: comorbid symptoms gradually worsen; usually tied to disease progression or the degree of night-time light exposure. Remission state: comorbid symptoms ease and the patient recovers partially or fully. Death state:
+3. Define the states:
+4. Healthy state: people with no comorbid condition.
+5. Single-disease state: people with one specific disease; this can be split into categories such as mental illness (e.g., anxiety, depression) or physical illness (e.g., hypertension, diabetes).
+6. Multimorbidity state: two or more diseases present at the same time, e.g., anxiety together with insomnia, or diabetes with cardiovascular disease.

+ 60 - 0
paper_code/code.R

@@ -0,0 +1,60 @@
+# install.packages("msm", repos = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
+
+library(msm)
+library(survival)
+
+# data <- data.frame(
+#   ID = c(1, 1, 1, 2, 2, 2),        # subject ID
+#   time = c(0, 1, 2, 0, 1, 2),      # follow-up time
+#   state = c(1, 2, 3, 1, 1, 2),     # disease state
+#   birth_year = c(1970, 1970, 1970, 1980, 1980, 1980),
+#   gender = c(1, 1, 1, 2, 2, 2),    # gender
+#   education = c(3, 3, 3, 2, 2, 2)  # education level
+# )
+# statetable.msm(state, ID, data = data)
+
+# qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
+#                          0.1, -0.3, 0.2,
+#                          0, 0, 0), 
+#                        nrow = 3, byrow = TRUE)
+
+# msm_model <- msm(state ~ time, subject = ID, data = data,
+#                  qmatrix = qmatrix_init, 
+#                  covariates = ~ gender + education)
+# pmatrix.msm(msm_model, t = 1)  # t = 1 is the interval between follow-ups
+# summary(msm_model)
+
+# Build the example data frame
+data <- data.frame(
+  ID = c(1, 1, 1, 2, 2, 2),        # subject ID
+  time = c(0, 1, 2, 0, 1, 2),      # follow-up time
+  state = c(1, 2, 3, 1, 1, 2),     # disease state
+  birth_year = c(1970, 1970, 1970, 1980, 1980, 1980), # birth year
+  gender = c(1, 1, 1, 2, 2, 2),    # gender
+  education = c(3, 3, 3, 2, 2, 2)  # education level
+)
+
+# Tabulate the state-transition frequencies
+freq_table <- statetable.msm(state, ID, data = data)
+print(freq_table)
+
+# Initialise the transition intensity matrix
+qmatrix_init <- matrix(c(-0.5, 0.25, 0.25,
+                          0.1, -0.3, 0.2,
+                          0.3, 0.2, -0.5), 
+                        nrow = 3, byrow = TRUE)
+
+# Derive crude initial intensities from the observed transitions
+crude_init <- crudeinits.msm(state ~ time, subject = ID, data = data, qmatrix = qmatrix_init)
+
+# Fit the multi-state model
+msm_model <- msm(state ~ time, subject = ID, data = data,
+                 qmatrix = crude_init,
+                 covariates = ~ gender + education)
+
+# Transition probability matrix
+prob_matrix <- pmatrix.msm(msm_model, t = 1)  # t = 1 is the interval between follow-ups
+print(prob_matrix)
+
+# Inspect the detailed model results
+summary(msm_model)
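
For readers following along in Python rather than R, the transition frequency table that statetable.msm prints can be reproduced with a groupby shift and a crosstab; a sketch over the same toy data, assuming only pandas:

    import pandas as pd

    df = pd.DataFrame({
        "ID":    [1, 1, 1, 2, 2, 2],
        "time":  [0, 1, 2, 0, 1, 2],
        "state": [1, 2, 3, 1, 1, 2],
    }).sort_values(["ID", "time"])

    # Pair each observation with the subject's next state, then tabulate.
    df["next_state"] = df.groupby("ID")["state"].shift(-1)
    print(pd.crosstab(df["state"], df["next_state"]))  # NaN (last obs per ID) is dropped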

+ 73 - 56
test.py

@@ -75,59 +75,76 @@ import pandas as pd
 #     except Exception as e:
 #         print(f"Error reading {file_path}: {e}")
 # print(num)
-import pyreadstat
-import numpy as np
-
-year = "2018"
-cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
-# Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 10 points, picture drawing 1 point
-cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) 
-cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) 
-cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
-cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
-cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
-
-# Word recall
-cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
-cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
-cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
-cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1) 
-cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1) 
-cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)                                            
-cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1) 
-cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1) 
-cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)                                            
-cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)                                           
-# cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
-cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1) 
-cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1) 
-cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1) 
-cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1) 
-cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1) 
-cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)                                            
-cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1) 
-cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1) 
-cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)                                            
-cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)                                            
-# cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
-# Picture drawing
-cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
-
-cognition["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
-    cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
-    cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
-    cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
-    cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
-    cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
-    cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
-    cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
-    cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
-    cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
-    cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
-    cognition["draw_score"]
-cognition.to_csv("/root/r_base/CHARLS/test.csv")
+# import pyreadstat
+# import numpy as np
+
+# year = "2018"
+# cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
+# # Compute the cognition score in three parts: telephone questionnaire 10 points, word recall 10 points, picture drawing 1 point
+# cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+# cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+# cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+# cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+# cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+# cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1) 
+# cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1) 
+# cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
+# cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
+# cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
+
+# # Word recall
+# cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
+# cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
+# cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
+# cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1) 
+# cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1) 
+# cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)                                            
+# cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1) 
+# cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1) 
+# cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)                                            
+# cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)                                           
+# # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0) 
+# cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1) 
+# cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1) 
+# cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1) 
+# cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1) 
+# cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1) 
+# cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)                                            
+# cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1) 
+# cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1) 
+# cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)                                            
+# cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)                                            
+# # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
+# # Picture drawing
+# cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
+
+# cognition["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
+#     cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
+#     cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
+#     cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
+#     cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
+#     cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
+#     cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
+#     cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
+#     cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
+#     cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
+#     cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
+#     cognition["draw_score"]
+# cognition.to_csv("/root/r_base/CHARLS/test.csv")
+
+import pandas as pd
+CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
+# Coalesce the harmonized (_m) columns with the wave-specific ones
+merge_list = ["marital_status_m", "Height_m", "Weight_m", "waist_m", "Systolic_m", "Diastolic_m",
+              "Physical_activity_m", "Smoke_m", "Drink_m", "Hypertension_m", "Disabetes_or_High_Blood_Sugar_m",
+              "Cancer_or_Malignant_Tumor_m", "Chronic_Lung_Diseases_m", "Heart_Problems_m", "Emotional_Nervous_or_Psychiatric_Problems_m",
+              "Stroke_m", "Arthritis_or_Rheumatism_m", "Dyslipidemia_m", "Liver_Disease_m", "Kidney_Diease_m", "Stomach_or_Other_Digestive_Disease_m",
+              "Asthma_m", "Memory_Related_Disease_m", "Psychiatric_score_m", "sleep_state_m", "Cognition_score_m"]
+
+# Walk merge_list and fill each base column from its _m counterpart
+for col_m in merge_list:
+    col = col_m.replace('_m', '')  # drop the '_m' suffix to get the target column name
+    if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
+        CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
+
+CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h_test.csv")
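
The fillna direction matters here: CHARLS_data[col_m].fillna(CHARLS_data[col]) starts from the harmonized value and falls back to the wave-specific one, not the reverse. A two-row sketch of the behaviour:

    import pandas as pd

    df = pd.DataFrame({"Height_m": [170.0, None], "Height": [168.0, 165.0]})
    df["Height"] = df["Height_m"].fillna(df["Height"])
    print(df["Height"].tolist())  # [170.0, 165.0] -> the harmonized value wins when present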