Browse Source

修改一版

JazzZhao 4 weeks ago
parent
commit
cd06a433be

+ 1 - 0
CHARLS_P/CHARLS_NDVI.py

@@ -20,6 +20,7 @@ if __name__ == "__main__":
         table_merge = pd.merge(CHARLS_data_year, ndvi_data, left_on="city", right_on="CITY", how='left')
         # table_merge_last.to_csv("123.csv",index=False)
         #更新CHARLS表
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_ndvi'] = table_merge[str(year)].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_ndvi'] = table_merge[str(year-1)].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_ndvi'] = table_merge[str(year-2)].values
         print(year)

+ 1 - 0
CHARLS_P/CHARLS_NL.py

@@ -15,6 +15,7 @@ for year in years:
     table_merge = pd.merge(CHARLS_data_year, pollutants_data, left_on="city", right_on="ext_name", how='left')
     # table_merge_last.to_csv("123.csv",index=False)
     #更新CHARLS表
+    CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_nl'] = table_merge[str(year)].values
     CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_nl'] = table_merge[str(year-1)].values
     CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_nl'] = table_merge[str(year-2)].values
     print(year)

+ 16 - 5
CHARLS_P/CHARLS_PM.py

@@ -5,7 +5,7 @@ import os
 def pollutant_handle(path):
     years = [2011, 2013,2015, 2018, 2020]
     #读取污染物数据
-    pollutants_data = pd.read_csv("pollution/result_O3_p.csv")
+    pollutants_data = pd.read_csv("pollution/result_pm10_1km_p.csv")
     for year in years:
         CHARLS_data = pd.read_csv(path)
         print(CHARLS_data.info())
@@ -13,12 +13,14 @@ def pollutant_handle(path):
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         #两个表合并
         table_merge = pd.merge(CHARLS_data_year, pollutants_data, on=['province', 'city'], how='left')
-        if str(year - 1) in table_merge.columns:
+        if str(year) in table_merge.columns:
             #更新CHARLS表
-            CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_O3'] = table_merge[str(year-1)].values
+            CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_PM10'] = table_merge[str(year)].values
+        if str(year - 1) in table_merge.columns:
+            CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_PM10'] = table_merge[str(year-1)].values
         if str(year - 2) in table_merge.columns:
-            CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_O3'] = table_merge[str(year-2)].values
-        CHARLS_data.to_csv("CHARLS_data_pollutants.csv",index=False)
+            CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_PM10'] = table_merge[str(year-2)].values
+        CHARLS_data.to_csv(path ,index=False)
         print(year)
 
 def aba_handle(path_data):
@@ -27,12 +29,21 @@ def aba_handle(path_data):
         CHARLS_data = pd.read_csv(path_data)
         path = "aba627/result/"
         #读取污染物组分
+        cur_year_file_name = path+str(year)+"_PM25_and_species_p.csv"
         last_year_file_name = path+str(year-1)+"_PM25_and_species_p.csv"
         before_last_file_name = path+str(year-2)+"_PM25_and_species_p.csv"
+        cur_pollutants_data = pd.read_csv(cur_year_file_name)
         last_year_pollutants_data = pd.read_csv(last_year_file_name)
         before_last_pollutants_data = pd.read_csv(before_last_file_name)
         #开始筛选出year的数据
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
+        #和当年的污染物组分文件合并
+        cur_table_merge = pd.merge(CHARLS_data_year, cur_pollutants_data, on=['province', 'city'], how='left')
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_SO4'] = cur_table_merge["SO4"].values
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_NO3'] = cur_table_merge["NO3"].values
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_NH4'] = cur_table_merge["NH4"].values
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_OM'] = cur_table_merge["OM"].values
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_BC'] = cur_table_merge["BC"].values
         #和上一年的污染物组分文件合并
         last_table_merge = pd.merge(CHARLS_data_year, last_year_pollutants_data, on=['province', 'city'], how='left')
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_SO4'] = last_table_merge["SO4"].values

+ 30 - 5
CHARLS_P/CHARLS_meteorology.py

@@ -4,6 +4,9 @@ def sunlight(CHARLS_data):
     years = [2011, 2013,2015, 2018, 2020]
     for year in years:
         #读取日照数据
+        sunlight_data_cur = pd.read_excel(f"气象数据/2000-2020年按年逐年日照/【{year}年】逐年日照.xlsx")
+        sunlight_data_cur = sunlight_data_cur[["城市", "累积日照"]]
+        sunlight_data_cur = sunlight_data_cur.rename(columns={"累积日照":"cur_sunlight"})
         sunlight_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年日照/【{year-1}年】逐年日照.xlsx")
         sunlight_data_last = sunlight_data_last[["城市", "累积日照"]]
         sunlight_data_last = sunlight_data_last.rename(columns={"累积日照":"last_sunlight"})
@@ -13,9 +16,11 @@ def sunlight(CHARLS_data):
         #开始筛选出year的数据
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         #合并日照
-        table_merge = pd.merge(CHARLS_data_year, sunlight_data_last, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(CHARLS_data_year, sunlight_data_cur, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(table_merge, sunlight_data_last, left_on="city", right_on="城市", how='left')
         table_merge = pd.merge(table_merge, sunlight_data_before_last, left_on="city", right_on="城市", how='left')
         #更新CHARLS表
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_sunlight'] = table_merge['cur_sunlight'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_sunlight'] = table_merge['last_sunlight'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_sunlight'] = table_merge['before_sunlight'].values
         CHARLS_data.to_csv("CHARLS_data_p_n_m.csv",index=False)
@@ -25,6 +30,9 @@ def wind(CHARLS_data):
     years = [2011, 2013,2015, 2018, 2020]
     for year in years:
         #读取日照数据
+        wind_data_cur = pd.read_excel(f"气象数据/2000-2020年按年逐年风速/【{year}年】逐年风速.xlsx")
+        wind_data_cur = wind_data_cur[["城市", "平均风速"]]
+        wind_data_cur = wind_data_cur.rename(columns={"平均风速":"cur_wind"})
         wind_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年风速/【{year-1}年】逐年风速.xlsx")
         wind_data_last = wind_data_last[["城市", "平均风速"]]
         wind_data_last = wind_data_last.rename(columns={"平均风速":"last_wind"})
@@ -34,9 +42,11 @@ def wind(CHARLS_data):
         #开始筛选出year的数据
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         #合并日照
-        table_merge = pd.merge(CHARLS_data_year, wind_data_last, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(CHARLS_data_year, wind_data_cur, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(table_merge, wind_data_last, left_on="city", right_on="城市", how='left')
         table_merge = pd.merge(table_merge, wind_data_before_last, left_on="city", right_on="城市", how='left')
         #更新CHARLS表
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_wind'] = table_merge['cur_wind'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_wind'] = table_merge['last_wind'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_wind'] = table_merge['before_wind'].values
         CHARLS_data.to_csv("CHARLS_data_p_n_m.csv",index=False)
@@ -46,6 +56,9 @@ def rain(CHARLS_data):
     years = [2011, 2013,2015, 2018, 2020]
     for year in years:
         #读取日照数据
+        rain_data_cur = pd.read_excel(f"气象数据/2000-2020年按年逐年降水/【{year}年】逐年降水.xlsx")
+        rain_data_cur = rain_data_cur[["城市", "累积降水"]]
+        rain_data_cur = rain_data_cur.rename(columns={"累积降水":"cur_rain"})
         rain_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年降水/【{year-1}年】逐年降水.xlsx")
         rain_data_last = rain_data_last[["城市", "累积降水"]]
         rain_data_last = rain_data_last.rename(columns={"累积降水":"last_rain"})
@@ -55,9 +68,11 @@ def rain(CHARLS_data):
         #开始筛选出year的数据
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         #合并日照
-        table_merge = pd.merge(CHARLS_data_year, rain_data_last, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(CHARLS_data_year, rain_data_cur, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(table_merge, rain_data_last, left_on="city", right_on="城市", how='left')
         table_merge = pd.merge(table_merge, rain_data_before_last, left_on="city", right_on="城市", how='left')
         #更新CHARLS表
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_rain'] = table_merge['cur_rain'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_rain'] = table_merge['last_rain'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_rain'] = table_merge['before_rain'].values
         CHARLS_data.to_csv("CHARLS_data_p_n_m.csv",index=False)
@@ -67,6 +82,9 @@ def temperature(CHARLS_data):
     years = [2011, 2013,2015, 2018, 2020]
     for year in years:
         #读取日照数据
+        temperature_data_cur = pd.read_excel(f"气象数据/2000-2020年按年逐年气温/【{year}年】逐年气温.xlsx")
+        temperature_data_cur = temperature_data_cur[["城市", "平均气温"]]
+        temperature_data_cur = temperature_data_cur.rename(columns={"平均气温":"cur_temperature"})
         temperature_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年气温/【{year-1}年】逐年气温.xlsx")
         temperature_data_last = temperature_data_last[["城市", "平均气温"]]
         temperature_data_last = temperature_data_last.rename(columns={"平均气温":"last_temperature"})
@@ -76,9 +94,11 @@ def temperature(CHARLS_data):
         #开始筛选出year的数据
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         #合并日照
-        table_merge = pd.merge(CHARLS_data_year, temperature_data_last, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(CHARLS_data_year, temperature_data_cur, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(table_merge, temperature_data_last, left_on="city", right_on="城市", how='left')
         table_merge = pd.merge(table_merge, temperature_data_before_last, left_on="city", right_on="城市", how='left')
         #更新CHARLS表
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_temperature'] = table_merge['cur_temperature'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_temperature'] = table_merge['last_temperature'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_temperature'] = table_merge['before_temperature'].values
         CHARLS_data.to_csv("CHARLS_data_p_n_m.csv",index=False)
@@ -88,6 +108,9 @@ def humidity(CHARLS_data):
     years = [2011, 2013,2015, 2018, 2020]
     for year in years:
         #读取日照数据
+        humidity_data_cur = pd.read_excel(f"气象数据/2000-2020年按年逐年湿度/【{year}年】逐年湿度.xlsx")
+        humidity_data_cur = humidity_data_cur[["城市", "平均湿度"]]
+        humidity_data_cur = humidity_data_cur.rename(columns={"平均湿度":"cur_humidity"})
         humidity_data_last = pd.read_excel(f"气象数据/2000-2020年按年逐年湿度/【{year-1}年】逐年湿度.xlsx")
         humidity_data_last = humidity_data_last[["城市", "平均湿度"]]
         humidity_data_last = humidity_data_last.rename(columns={"平均湿度":"last_humidity"})
@@ -97,9 +120,11 @@ def humidity(CHARLS_data):
         #开始筛选出year的数据
         CHARLS_data_year = CHARLS_data[CHARLS_data['wave']==year]
         #合并日照
-        table_merge = pd.merge(CHARLS_data_year, humidity_data_last, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(CHARLS_data_year, humidity_data_cur, left_on="city", right_on="城市", how='left')
+        table_merge = pd.merge(table_merge, humidity_data_last, left_on="city", right_on="城市", how='left')
         table_merge = pd.merge(table_merge, humidity_data_before_last, left_on="city", right_on="城市", how='left')
         #更新CHARLS表
+        CHARLS_data.loc[CHARLS_data['wave']==year, 'cur_year_humidity'] = table_merge['cur_humidity'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'last_year_humidity'] = table_merge['last_humidity'].values
         CHARLS_data.loc[CHARLS_data['wave']==year, 'before_last_humidity'] = table_merge['before_humidity'].values
         CHARLS_data.to_csv("CHARLS_data_p_n_m.csv",index=False)

+ 30 - 5
paper_code/code.R

@@ -4,7 +4,7 @@ library(msm)
 library(survival)
 library(dplyr)
 
-data <- read.csv("paper_data.csv")
+data <- read.csv("paper_data_new.csv")
 
 # 定义一个函数
 feature_function <- function(data) {
@@ -101,13 +101,16 @@ output <- one_function(data, crude_init)
 
 # 多协变量的分析,全部
 # 定义协变量列表
-covariates_list <- c("last_year_NO3", "last_year_SO4", "last_year_NH4", "last_year_OM")
-
+# covariates_list <- c("last_year_SO4", "last_year_NO3", "last_year_BC", "last_year_NH4", "last_year_OM")
+covariates_list <- c("cur_year_SO4", "cur_year_NO3", "cur_year_BC", "cur_year_NH4", "cur_year_OM")
+# covariates_list <- c("before_last_SO4", "before_last_NO3", "before_last_BC", "before_last_NH4", "before_last_OM")
 # 循环计算不同协变量的模型
 for (cov in covariates_list) {
   print(cov)
   # 动态生成协变量的公式
-  cov_formula <- paste("~", cov, "+rgender_group+age_group+marital_group+education_group+activity_group+psychiatric_group+BMI_group+ADL_group+Smoke_group+Drink_group+last_year_NO2+last_year_O3+last_year_PM1+last_year_nl")
+  # cov_formula <- paste("~", cov, "+rgender_group+age_group+marital_group+education_group+activity_group+psychiatric_group+BMI_group+ADL_group+Smoke_group+Drink_group+last_year_NO2+last_year_O3+last_year_PM1+last_year_nl")
+  cov_formula <- paste("~", cov, "+rgender_group+age_group+marital_group+education_group+activity_group+psychiatric_group+BMI_group+ADL_group+Smoke_group+Drink_group+cur_year_NO2+cur_year_O3+cur_year_PM1+cur_year_nl")
+  # cov_formula <- paste("~", cov, "+rgender_group+age_group+marital_group+education_group+activity_group+psychiatric_group+BMI_group+ADL_group+Smoke_group+Drink_group+before_last_NO2+before_last_O3+before_last_PM1+before_last_nl")
   # 进行多状态模型分析
   msm_model <- msm(state ~ wave, subject = ID, data = data,
                   qmatrix = crude_init,
@@ -127,6 +130,13 @@ for (cov in covariates_list) {
 # 查看模型的详细结果
 summary(msm_model)
 
+# 去掉协变量
+msm_model <- msm(state ~ wave, subject = ID, data = data,
+                qmatrix = crude_init,
+                death = 4,
+                method = "BFGS", control = list(fnscale = 5000, maxit = 10000)
+                )
+
 # 计算状态转移概率矩阵
 prob_matrix <- pmatrix.msm(msm_model, t = 5)  # t = 1 代表随访之间的间隔时间
 print(prob_matrix)
@@ -148,7 +158,8 @@ print(so_journ)
 
 #单个协变量的分析
 # 定义协变量列表
-covariates_list <- list(~age_group,~rgender_group,~marital_group,~education_group,~activity_group,~psychiatric_group,~BMI_group,~ADL_group, ~Smoke,~Drink,~last_year_NO2, ~last_year_O3, ~last_year_PM1, ~last_year_PM2.5, ~last_year_PM10, ~last_year_SO4,~last_year_NO3,~last_year_NH4,~last_year_OM,~last_year_BC, ~last_year_nl)
+covariates_list <- list(~age_group,~rgender_group,~marital_group,~education_group,~activity_group,~psychiatric_group,~BMI_group,~ADL_group, ~Smoke,~Drink,~last_year_NO2, ~last_year_O3, ~last_year_PM1, ~last_year_PM2.5, ~last_year_PM10, ~last_year_SO4,~last_year_NO3,~last_year_NH4,~last_year_OM,~last_year_BC, ~last_year_nl,
+~cur_year_NO2, ~cur_year_O3, ~cur_year_PM1, ~cur_year_PM2.5, ~cur_year_PM10, ~cur_year_SO4,~cur_year_NO3,~cur_year_NH4,~cur_year_OM,~cur_year_BC, ~cur_year_nl)
 
 # 创建一个空的结果文件
 result_file <- "msm_model_results.txt"
@@ -175,3 +186,17 @@ for (cov in covariates_list) {
 }
 
 
+library(dplyr)
+time_check <- data %>%
+  group_by(ID) %>%
+  arrange(wave) %>%
+  mutate(
+    last_year_pollution = lag(cur_year_SO4), # 用当前年变量生成实际的上一年数据
+    time_diff = wave - lag(wave)
+  ) %>%
+  select(ID, wave, time_diff, cur_year_SO4, last_year_SO4, last_year_pollution) %>%
+  filter(!is.na(last_year_SO4))
+
+# 检查系统是否自动对齐
+cat("时间错位比例:", mean(time_check$last_year_SO4 != time_check$last_year_pollution, na.rm=T)*100, "%\n")
+# 若输出>0%,说明存在错位记录

+ 6 - 5
paper_code/data_preprocess.py

@@ -7,10 +7,11 @@ from sklearn.impute import IterativeImputer
 if __name__ == "__main__":
     CHARLS_data = pd.read_csv("CHARLS_data_p_n_m_nd_h.csv")
 
-    cavariates = ["last_year_NO2", 	"before_last_NO2", 	"last_year_O3", "before_last_O3", 
-                        "last_year_PM1",	"before_last_PM1",	"last_year_PM2.5",	"before_last_PM2.5",	"last_year_PM10",
-                    	'before_last_PM10',	'last_year_SO4',	'last_year_NO3',	'last_year_NH4',	'last_year_OM',	'last_year_BC',	'before_last_SO4',
-                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'last_year_nl',	'before_last_nl']
+    cavariates = ["cur_year_NO2","last_year_NO2", 	"before_last_NO2", 	"cur_year_O3", "last_year_O3", "before_last_O3", 
+                        "cur_year_PM1", "last_year_PM1",	"before_last_PM1",	"cur_year_PM2.5", "last_year_PM2.5",	"before_last_PM2.5",	"cur_year_PM10", "last_year_PM10",
+                    	'before_last_PM10',	'cur_year_SO4', 'cur_year_NO3', 'cur_year_NH4', 'cur_year_OM', 'cur_year_BC',
+                        'last_year_SO4', 'last_year_NO3',	'last_year_NH4',	'last_year_OM',	'last_year_BC',	'before_last_SO4',
+                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'cur_year_nl', 'last_year_nl',	'before_last_nl']
     #挑出需要的字段
     data = CHARLS_data[["ID", "rgender", "age", "marital_status", "education", "Physical_activity", "Psychiatric_score", "BMI", "ADL", "Smoke", "Drink",
                         'Hypertension','Dyslipidemia','Disabetes_or_High_Blood_Sugar','Cancer_or_Malignant_Tumor','Chronic_Lung_Diseases', 
@@ -183,5 +184,5 @@ if __name__ == "__main__":
     data[data < 0] = 0
 
     #排序将ID相同的放到一起
-    data.to_csv("paper_data.csv", index=False)
+    data.to_csv("paper_data_new.csv", index=False)
     

+ 42 - 5
paper_code/figure.r

@@ -5,7 +5,7 @@
 library(compareGroups)
 library(dplyr)
 
-data <- read.csv("paper_data.csv")
+data <- read.csv("paper_data_new.csv")
 data <- subset(data, wave == 2011)
 
 #性别
@@ -183,10 +183,10 @@ ggsave("forestplot_output.png", plot = p, width = 35, height = 30, dpi = 300, bg
 library(forestploter)
 library(grid) # 加载 grid 包
 # dt <- read.csv(system.file("extdata", "example_data.csv", package = "forestploter"))
-dt <- read.csv("PM1+BC.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
+dt <- read.csv("PM1+BC_C.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
 # 下面的数据处理方式和前面的几乎一样
 # 如果Placebo列中不是NA,缩进子组
-dt$Subgroup <- ifelse(is.na(dt$num), 
+dt$Subgroup <- ifelse(is.na(dt$num),  
                       dt$Subgroup,
                       paste0("··", dt$Subgroup, ""))  # 加粗并缩进
           
@@ -196,8 +196,45 @@ dt$Subgroup <- ifelse(is.na(dt$num),
 # dt$n2 <- ifelse(is.na(dt$Placebo), "", dt$Placebo)
 
 # 为CI添加两个空白列
+# 创建格式化字符串列(关键修改部分)
+dt$`Deteriorate Estimate (95% CI)` <- paste(
+  ifelse(!is.na(dt$est_gp1),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp1, dt$low_gp1, dt$hi_gp1),
+         ""),
+  ifelse(!is.na(dt$est_gp2),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp2, dt$low_gp2, dt$hi_gp2),
+         ""),
+  ifelse(!is.na(dt$est_gp3),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp3, dt$low_gp3, dt$hi_gp3),
+         ""),
+  sep = "\n"
+)
 dt$`Deteriorate transition` <- paste(rep(" ", 25), collapse = " ")
+dt$`Recovery Estimate (95% CI)` <- paste(
+  ifelse(!is.na(dt$est_gp4),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp4, dt$low_gp4, dt$hi_gp4),
+         ""),
+  ifelse(!is.na(dt$est_gp5),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp5, dt$low_gp5, dt$hi_gp5),
+         ""),
+  ifelse(!is.na(dt$est_gp6),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp6, dt$low_gp6, dt$hi_gp6),
+         ""),
+  sep = "\n"
+)
 dt$`Recovery transition` <- paste(rep(" ", 25), collapse = " ")
+dt$`Mortality Estimate (95% CI)` <- paste(
+  ifelse(!is.na(dt$est_gp7),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp7, dt$low_gp7, dt$hi_gp7),
+         ""),
+  ifelse(!is.na(dt$est_gp8),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp8, dt$low_gp8, dt$hi_gp8),
+         ""),
+  ifelse(!is.na(dt$est_gp9),
+         sprintf("%.2f (%.2f-%.2f)", dt$est_gp9, dt$low_gp9, dt$hi_gp9),
+         ""),
+  sep = "\n"
+)
 dt$`Mortality transition` <- paste(rep(" ", 25), collapse = " ")
 
 # 设置森林图主题
@@ -222,7 +259,7 @@ tm <- forest_theme(
 )
 # 绘制森林图
 p <- forest(
-  dt[,c(1, 30, 31, 32)],  # 选择要在森林图中使用的数据列
+  dt[,c(1, 30, 31, 32,33,34,35)],  # 选择要在森林图中使用的数据列
   est = list(
     dt$est_gp1, dt$est_gp4, dt$est_gp7,
     dt$est_gp2, dt$est_gp5, dt$est_gp8,
@@ -235,7 +272,7 @@ p <- forest(
     dt$hi_gp1, dt$hi_gp4, dt$hi_gp7,
     dt$hi_gp2, dt$hi_gp5, dt$hi_gp8,
     dt$hi_gp3, dt$hi_gp6, dt$hi_gp9),
-  ci_column = c(2, 3, 4),         # 指定CI列
+  ci_column = c(3, 5, 7),         # 指定CI列
   ref_line = 1,                # 添加参考线
   vert_line = c(0.5, 2),       # 添加垂直线
   nudge_y = 0.2,               # 垂直调整标签位置

+ 11 - 9
paper_code/vif.py

@@ -4,19 +4,21 @@ from statsmodels.stats.outliers_influence import variance_inflation_factor
 from sklearn.preprocessing import StandardScaler
 
 # 实际数据
-df = pd.read_csv("paper_data.csv")
+df = pd.read_csv("paper_data_new.csv")
 
-df = df.drop(columns=["before_last_NO2", "before_last_O3", "last_year_pm2.5","last_year_pm10",
-                        "before_last_pm1",	"before_last_pm2.5","last_year_SO4","last_year_BC",
-                    	'before_last_pm10',	'before_last_SO4',"last_year_OM",'last_year_NH4',
-                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'before_last_nl'])
+df = df.drop(columns=["before_last_NO2", "before_last_O3", "last_year_PM2.5","last_year_PM10",
+                        "before_last_PM1",	"before_last_PM2.5","last_year_SO4","last_year_BC",
+                    	'before_last_PM10',	'before_last_SO4',"last_year_OM",'last_year_NH4',
+                    	'before_last_NO3',	'before_last_NH4',	'before_last_OM',	'before_last_BC',	'before_last_nl',
+                        'last_year_NO2', 'last_year_O3', 'last_year_PM1', 'last_year_NO3', 'last_year_nl', 'cur_year_PM2.5', 'cur_year_PM10','cur_year_SO4',
+                        'cur_year_BC', 'cur_year_NO3', 'cur_year_NH4'])
 # 标准化交互所需的列
-scaler = StandardScaler()
-columns_to_standardize = ["last_year_O3", "last_year_nl"]
-standardized_data = scaler.fit_transform(df[columns_to_standardize])
+# scaler = StandardScaler()
+# columns_to_standardize = ["last_year_O3", "last_year_nl"]
+# standardized_data = scaler.fit_transform(df[columns_to_standardize])
 
 # 计算交互项,使用标准化后的值
-df["nl_pm1"] = standardized_data[:, 1] * standardized_data[:, 0]  # 标准化后计算交互项
+# df["nl_PM1"] = standardized_data[:, 1] * standardized_data[:, 0]  # 标准化后计算交互项
 
 # 计算每个变量的 VIF方差膨胀因子
 def calculate_vif(df):

+ 8 - 14
time.md

@@ -1,20 +1,13 @@
 # 番茄时间
 ***
 
-5. 大模型面试50问
-6. 自己动手学深度学习
+1. 自来水导航和文本客服的统计报表需求
+2. 中移电话反诈相关测试,完成基于语义的分类模型训练和测试,并统计分类准确率
+3. 根据所投期刊要求,完成“基于检索增强的电力客服领域语言模型优化方法”论文的修改
+4. 根据所投期刊要求, 完成“基于Pipeline框架的智能对话系统交互优化”论文的修改
 
-4. 介绍一下当前主流的大模型结构是如何组成的?
-5. 介绍一下当前的GPT大模型的结构包括哪些部分?
-6. 介绍下大模型的多头注意力机制
-7. 什么是大模型MOE结构
-8. 知识蒸馏的步骤是什么
-9. 知识蒸馏中的教师模型和学生模型有什么区别
-10. 什么是分组混合并行训练方法
-11. 混合专家MOE的基本原理是什么
-
-
-12. 看transformer代码
+easytier调整好
+transformer
 
 
 docker run -itd -p 17860:7860 -p 18022:22 -v /ai_home/zhaojingteng:/root/zhaojingteng --name=zjt_chat --gpus='"device=5,6,7"' nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
@@ -64,4 +57,5 @@ curl -v google.com
 [Service]
 Environment="HTTP_PROXY=http://10.126.126.2:12334"
 Environment="HTTPS_PROXY=http://10.126.126.2:12334"
-Environment="NO_PROXY=localhost,127.0.0.1,.example.com"
+Environment="NO_PROXY=localhost,127.0.0.1,.example.com"
+

+ 310 - 0
transformer.py

@@ -0,0 +1,310 @@
+import torch
+import torch.nn as nn
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import ParameterGrid
+import matplotlib
+import seaborn as sns
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from scipy.stats import shapiro
+from statsmodels.graphics.tsaplots import plot_acf
+from scipy.stats import probplot
+from statsmodels.tsa.stattools import adfuller
+
+# 设置中文字体
+matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # Windows下的常见中文字体
+matplotlib.rcParams['axes.unicode_minus'] = False  # 正常显示负号
+
+# 数据预处理
+data = pd.read_excel('./daily_flu_counts.xlsx')
+data = data[['日期', '发病数']].sort_values('日期').fillna(0)
+
+
+# 使用线性插值填充缺失值
+data['发病数'] = data['发病数'].interpolate(method='linear')
+# 确保数据非负
+data['发病数'] = data['发病数'].clip(lower=0)
+# 确保 '日期' 列为 datetime 类型
+data['日期'] = pd.to_datetime(data['日期'])
+
+# 绘制原数据的曲线图
+plt.figure(figsize=(12, 6))
+plt.plot(data['日期'], data['发病数'], label='发病数', color='blue')
+
+
+plt.title('每日发病数曲线图')
+plt.xlabel('日期')
+plt.ylabel('发病数')
+plt.legend()
+plt.grid(True)
+plt.xticks(rotation=45)
+plt.show()
+
+# 检查是否有可用的 GPU
+device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+print(f'Using device: {device}')
+
+# 读取数据
+data = pd.read_excel('./daily_flu_counts.xlsx')
+data = data[['日期', '发病数']].sort_values('日期').fillna(0)
+
+# 标准化数据
+scaler = StandardScaler()
+scaled_data = scaler.fit_transform(data['发病数'].values.reshape(-1, 1))
+
+# 创建时间序列数据集
+def create_dataset(dataset, look_back=1):
+    X, Y = [], []
+    for i in range(len(dataset) - look_back):
+        X.append(dataset[i:(i + look_back), 0])
+        Y.append(dataset[i + look_back, 0])
+    return np.array(X), np.array(Y)
+
+look_back = 30  # 滑动窗口长度
+X, Y = create_dataset(scaled_data, look_back)
+
+# 拆分训练集和测试集
+train_size = int(len(X) * 0.8)
+X_train, X_test = X[:train_size], X[train_size:]
+Y_train, Y_test = Y[:train_size], Y[train_size:]
+
+# 转换为 PyTorch 张量
+X_train = torch.tensor(X_train, dtype=torch.float32).view(-1, look_back, 1).to(device)
+X_test = torch.tensor(X_test, dtype=torch.float32).view(-1, look_back, 1).to(device)
+Y_train = torch.tensor(Y_train, dtype=torch.float32).view(-1, 1).to(device)
+Y_test = torch.tensor(Y_test, dtype=torch.float32).view(-1, 1).to(device)
+
+# 定义位置编码
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=5000):
+        super(PositionalEncoding, self).__init__()
+        
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+        
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        
+        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
+        self.register_buffer('pe', pe)
+    
+    def forward(self, x):
+        """
+        x: Tensor, shape [batch_size, seq_len, d_model]
+        """
+        x = x + self.pe[:, :x.size(1), :]
+        return x
+
+# 定义 Transformer 模型
+class EnhancedTransformerModel(nn.Module):
+    def __init__(self, input_size=1, d_model=64, nhead=8, num_encoder_layers=3, dim_feedforward=256, output_size=1, dropout=0.1):
+        super(EnhancedTransformerModel, self).__init__()
+        self.input_linear = nn.Linear(input_size, d_model)
+        self.positional_encoding = PositionalEncoding(d_model)
+        
+        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, 
+                                                   dim_feedforward=dim_feedforward, dropout=dropout)
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
+        
+        self.fc1 = nn.Linear(d_model, 64)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(64, output_size)
+    
+    def forward(self, x):
+        """
+        x: Tensor, shape [batch_size, seq_len, input_size]
+        """
+        x = self.input_linear(x)  # [batch_size, seq_len, d_model]
+        x = self.positional_encoding(x)  # 添加位置编码
+        x = x.transpose(0, 1)  # [seq_len, batch_size, d_model]
+        x = self.transformer_encoder(x)  # [seq_len, batch_size, d_model]
+        x = x.mean(dim=0)  # [batch_size, d_model]
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        return x
+
+# 超参数搜索的网格(移除 batch_size)
+param_grid = {
+    'd_model': [64, 128],
+    'nhead': [4, 8],
+    'num_encoder_layers': [2, 3],
+    'dim_feedforward': [256, 512],
+    'learning_rate': [0.0001, 0.0005, 0.001]
+}
+
+# 评估函数(移除 batch_size)
+def evaluate_model(d_model, nhead, num_encoder_layers, dim_feedforward, learning_rate):
+    # 初始化模型
+    model = EnhancedTransformerModel(
+        input_size=1, 
+        d_model=d_model, 
+        nhead=nhead, 
+        num_encoder_layers=num_encoder_layers, 
+        dim_feedforward=dim_feedforward
+    ).to(device)
+    loss_function = nn.MSELoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+
+    # 训练模型
+    epochs = 100
+    model.train()
+    for epoch in range(epochs):
+        optimizer.zero_grad()
+        y_pred = model(X_train)
+        loss = loss_function(y_pred, Y_train)
+        loss.backward()
+        optimizer.step()
+
+    # 预测和计算损失
+    model.eval()
+    with torch.no_grad():
+        test_predictions = model(X_test).cpu().numpy()
+    test_predictions_scaled = scaler.inverse_transform(test_predictions)
+    Y_test_scaled = scaler.inverse_transform(Y_test.cpu().numpy())
+    mse = mean_squared_error(Y_test_scaled, test_predictions_scaled)
+    
+    return mse
+
+# 网格搜索
+best_params = None
+best_mse = float('inf')
+
+for params in ParameterGrid(param_grid):
+    print(f'Evaluating with parameters: {params}')
+    mse = evaluate_model(**params)
+    print(f'MSE: {mse}')
+    
+    if mse < best_mse:
+        best_mse = mse
+        best_params = params
+
+print(f'Best parameters: {best_params}')
+print(f'Best MSE: {best_mse}')
+
+# 使用最佳参数实例化模型
+model = EnhancedTransformerModel(
+    input_size=1, 
+    d_model=best_params['d_model'], 
+    nhead=best_params['nhead'], 
+    num_encoder_layers=best_params['num_encoder_layers'], 
+    dim_feedforward=best_params['dim_feedforward']
+).to(device)
+
+# 定义损失函数和优化器
+loss_function = nn.MSELoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=best_params['learning_rate'])
+
+# 训练模型
+epochs = 500
+train_losses = []
+for epoch in range(epochs):
+    model.train()
+    optimizer.zero_grad()
+    y_pred = model(X_train)
+    loss = loss_function(y_pred, Y_train)
+    loss.backward()
+    optimizer.step()
+
+    train_losses.append(loss.item())
+    if epoch % 50 == 0:
+        print(f'Epoch {epoch}, Loss: {loss.item()}')
+
+# 绘制训练损失
+plt.plot(train_losses, label="训练损失")
+plt.xlabel("Epochs")
+plt.ylabel("Loss")
+plt.legend()
+plt.show()
+
+# 测试集预测
+model.eval()
+with torch.no_grad():
+    test_predictions = model(X_test).cpu().numpy()
+
+# 逆归一化
+test_predictions_scaled = scaler.inverse_transform(test_predictions)
+Y_test_scaled = scaler.inverse_transform(Y_test.cpu().numpy())
+
+# 确保预测值不小于0
+test_predictions_scaled = np.clip(test_predictions_scaled, 0, None)
+
+# 计算残差并中心化
+residuals = Y_test_scaled.flatten() - test_predictions_scaled.flatten()
+residuals_centered = residuals - residuals.mean()
+
+# Bootstrap参数
+n_bootstrap = 1000  # 重采样次数
+bootstrap_predictions = []
+
+np.random.seed(42)  # 为了结果可复现
+
+for _ in range(n_bootstrap):
+    # 从中心化的残差中有放回地采样
+    sampled_residuals = np.random.choice(residuals_centered, size=len(residuals_centered), replace=True)
+    # 生成新的预测样本
+    bootstrap_pred = test_predictions_scaled.flatten() + sampled_residuals
+    # 确保预测值不小于0
+    bootstrap_pred = np.clip(bootstrap_pred, 0, None)
+    bootstrap_predictions.append(bootstrap_pred)
+
+bootstrap_predictions = np.array(bootstrap_predictions)
+
+# 计算置信区间(例如 95%)
+lower_percentile = 2.5
+upper_percentile = 97.5
+ci_lower = np.percentile(bootstrap_predictions, lower_percentile, axis=0)
+ci_upper = np.percentile(bootstrap_predictions, upper_percentile, axis=0)
+
+
+# 预测未来的步骤数(例如 30 天)
+future_steps = 30
+future_predictions = []
+last_sequence = scaled_data[-look_back:].reshape(1, look_back, 1)
+last_sequence = torch.tensor(last_sequence, dtype=torch.float32).to(device)
+
+model.eval()
+with torch.no_grad():
+    for _ in range(future_steps):
+        pred_scaled = model(last_sequence).cpu().numpy()
+        future_predictions.append(pred_scaled.flatten()[0])
+        # 更新序列:移除第一个元素,添加新的预测
+        new_sequence = np.append(last_sequence.cpu().numpy().flatten()[1:], pred_scaled)
+        last_sequence = torch.tensor(new_sequence.reshape(1, look_back, 1), dtype=torch.float32).to(device)
+
+# 逆归一化
+future_predictions_scaled = scaler.inverse_transform(np.array(future_predictions).reshape(-1, 1)).flatten()
+future_predictions_scaled = np.clip(future_predictions_scaled, 0, None)
+
+# Bootstrap 未来预测置信区间
+bootstrap_future_predictions = []
+
+for _ in range(n_bootstrap):
+    # 假设未来预测的残差分布与测试集相似
+    sampled_residuals = np.random.choice(residuals_centered, size=future_steps, replace=True)
+    bootstrap_future_pred = future_predictions_scaled + sampled_residuals
+    # 确保预测值不小于0
+    bootstrap_future_pred = np.clip(bootstrap_future_pred, 0, None)
+    bootstrap_future_predictions.append(bootstrap_future_pred)
+
+bootstrap_future_predictions = np.array(bootstrap_future_predictions)
+
+# 计算置信区间(例如 95%)
+ci_lower_future = np.percentile(bootstrap_future_predictions, lower_percentile, axis=0)
+ci_upper_future = np.percentile(bootstrap_future_predictions, upper_percentile, axis=0)
+
+# 生成未来的日期
+last_date = data['日期'].iloc[-1]
+
+# 确保 last_date 是 pandas Timestamp 对象
+if not isinstance(last_date, pd.Timestamp):
+    last_date = pd.to_datetime(last_date)
+
+# 使用 start 和 periods 生成未来的日期,不使用 closed 参数
+future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=future_steps)