test.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. import pandas as pd
  2. # data = pd.read_csv("CLHLS/clhls_1998_2018_result.csv")
  3. # print(data.shape)
  4. # data = pd.read_csv("HRS/result_all.csv")
  5. # print(data.shape)
  6. # # 去重并统计ID个数
  7. # unique_ids = data.drop_duplicates(subset=["HHID", "PN"])
  8. # count_unique_ids = unique_ids.count()
  9. # print(count_unique_ids)
  10. # data = pd.read_csv("/root/r_base/UKDA-5050-stata/result_all.csv")
  11. # print(data.shape)
  12. # # 去重并统计ID个数
  13. # unique_ids = data.drop_duplicates(subset=["id"])
  14. # count_unique_ids = unique_ids.count()
  15. # print(count_unique_ids)
  16. # df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_data_eul_v1.dta', convert_categoricals=False)
  17. # print(df.shape)
  18. # df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_elsa_pensiongrid_eul_v2.dta', convert_categoricals=False)
  19. # print(df.shape)
  20. # df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_financial_derived_variables.dta', convert_categoricals=False)
  21. # print(df.shape)
  22. # df = pd.read_stata('/root/r_base/UKDA-5050-stata/stata/stata13_se/wave_9_ifs_derived_variables.dta', convert_categoricals=False)
  23. # print(df.shape)
  24. # 指定文件夹路径
  25. # import glob
  26. # import os
  27. # folder_path = '/root/r_base/NHANES/2017-2018'
  28. # # 获取所有 .xpt 文件的路径
  29. # xpt_files = glob.glob(os.path.join(folder_path, '*.XPT'))
  30. # num = 0
  31. # # 读取并处理每一个 .xpt 文件
  32. # for file_path in xpt_files:
  33. # try:
  34. # # 使用 pandas 读取 .xpt 文件
  35. # df = pd.read_sas(file_path, format='xport')
  36. # # 输出数据框的前几行以进行检查
  37. # print(f"Data from {file_path}:")
  38. # print(df.shape)
  39. # num += df.shape[1]
  40. # except Exception as e:
  41. # print(f"Error reading {file_path}: {e}")
  42. # print(num)
  43. # data = pd.read_csv("/root/r_base/CHARLS/result_all_new.csv")
  44. # print(data.shape)
  45. # # # 去重并统计ID个数
  46. # unique_ids = data.drop_duplicates(subset=["ID","communityID"])
  47. # count_unique_ids = unique_ids.count()
  48. # print(count_unique_ids)
  49. # 指定文件夹路径
  50. # import glob
  51. # import os
  52. # folder_path = '/root/r_base/CHARLS/CHARLS2018'
  53. # # Collect the paths of all .dta (Stata) files in the folder
  54. # xpt_files = glob.glob(os.path.join(folder_path, '*.dta'))
  55. # num = 0
  56. # # Read and process each .dta file
  57. # for file_path in xpt_files:
  58. # try:
  59. # # Read the .dta file with pandas
  60. # df = pd.read_stata(file_path)
  61. # # 输出数据框的前几行以进行检查
  62. # print(f"Data from {file_path}:")
  63. # print(df.shape)
  64. # num += df.shape[1]
  65. # except Exception as e:
  66. # print(f"Error reading {file_path}: {e}")
  67. # print(num)
  68. # import pyreadstat
  69. # import numpy as np
  70. # year = "2018"
  71. # cognition, meta = pyreadstat.read_dta("/root/r_base/CHARLS/CHARLS"+year+"/Cognition.dta")
  72. # #计算认知功能得分,分成三部分:电话问卷10分,词语回忆10分、画图1分
  73. # cognition["dc001s1_score"] = cognition["dc001_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  74. # cognition["dc001s2_score"] = cognition["dc006_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  75. # cognition["dc001s3_score"] = cognition["dc003_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  76. # cognition["dc002_score"] = cognition["dc005_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  77. # cognition["dc003_score"] = cognition["dc002_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  78. # cognition["dc019_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_1"]==97 else 1 if pd.isna(x["dc014_w4_1"]) and x["dc014_w4_1_1"]==93 else 0 if pd.isna(x["dc014_w4_1"]) and (not x["dc014_w4_1_1"]==93) else np.nan, axis=1)
  79. # cognition["dc020_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_2"]==97 else 1 if pd.isna(x["dc014_w4_2"]) and x["dc014_w4_2_1"]==86 else 0 if pd.isna(x["dc014_w4_2"]) and (not x["dc014_w4_2_1"]==86) else np.nan, axis=1)
  80. # cognition["dc021_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_3"]==97 else 1 if pd.isna(x["dc014_w4_3"]) and x["dc014_w4_3_1"]==79 else 0 if pd.isna(x["dc014_w4_3"]) and (not x["dc014_w4_3_1"]==79) else np.nan, axis=1)
  81. # cognition["dc022_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_4"]==97 else 1 if pd.isna(x["dc014_w4_4"]) and x["dc014_w4_4_1"]==72 else 0 if pd.isna(x["dc014_w4_4"]) and (not x["dc014_w4_4_1"]==72) else np.nan, axis=1)
  82. # cognition["dc023_score"] = cognition.apply(lambda x : 0 if x["dc014_w4_5"]==97 else 1 if pd.isna(x["dc014_w4_5"]) and x["dc014_w4_5_1"]==65 else 0 if pd.isna(x["dc014_w4_5"]) and (not x["dc014_w4_5_1"]==65) else np.nan, axis=1)
  83. # #词语记忆
  84. # cognition["dc006s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s1"]==1 else 0, axis=1)
  85. # cognition["dc006s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s2"]==2 else 0, axis=1)
  86. # cognition["dc006s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s3"]==3 else 0, axis=1)
  87. # cognition["dc006s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s4"]==4 else 0, axis=1)
  88. # cognition["dc006s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s5"]==5 else 0, axis=1)
  89. # cognition["dc006s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s6"]==6 else 0, axis=1)
  90. # cognition["dc006s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s7"]==7 else 0, axis=1)
  91. # cognition["dc006s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s8"]==8 else 0, axis=1)
  92. # cognition["dc006s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s9"]==9 else 0, axis=1)
  93. # cognition["dc006s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc028_w4_s10"]==10 else 0, axis=1)
  94. # # cognition["dc006s11_score"] = cognition["dc028_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  95. # cognition["dc027s1_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s1"]==1 else 0, axis=1)
  96. # cognition["dc027s2_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s2"]==2 else 0, axis=1)
  97. # cognition["dc027s3_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s3"]==3 else 0, axis=1)
  98. # cognition["dc027s4_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s4"]==4 else 0, axis=1)
  99. # cognition["dc027s5_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s5"]==5 else 0, axis=1)
  100. # cognition["dc027s6_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s6"]==6 else 0, axis=1)
  101. # cognition["dc027s7_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s7"]==7 else 0, axis=1)
  102. # cognition["dc027s8_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s8"]==8 else 0, axis=1)
  103. # cognition["dc027s9_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s9"]==9 else 0, axis=1)
  104. # cognition["dc027s10_score"] = cognition.apply(lambda x : np.nan if not x["wr101_intro"] ==1 else 1 if x["dc047_w4_s10"]==10 else 0, axis=1)
  105. # # cognition["dc027s11_score"] = cognition["dc047_w4_s11"].apply(lambda x : 1 if x==11 else 0 if pd.isna(x) else 0)
  106. # #画图
  107. # cognition["draw_score"] = cognition["dc024_w4"].apply(lambda x : 1 if x==1 else 0 if x==5 else np.nan)
  108. # cognition["Cognition_score"] = cognition["dc001s1_score"] + cognition["dc001s2_score"] + \
  109. # cognition["dc001s3_score"] + cognition["dc002_score"]+ cognition["dc003_score"]+ \
  110. # cognition["dc019_score"]+ cognition["dc020_score"] + cognition["dc021_score"]+ \
  111. # cognition["dc022_score"]+ cognition["dc023_score"] + cognition["dc006s1_score"] + \
  112. # cognition["dc006s2_score"] + cognition["dc006s3_score"] + cognition["dc006s4_score"] + \
  113. # cognition["dc006s5_score"] + cognition["dc006s6_score"] + cognition["dc006s7_score"] + \
  114. # cognition["dc006s8_score"] + cognition["dc006s9_score"] + cognition["dc006s10_score"] + \
  115. # cognition["dc027s1_score"]+ cognition["dc027s2_score"]+ \
  116. # cognition["dc027s3_score"]+ cognition["dc027s4_score"]+ cognition["dc027s5_score"]+ \
  117. # cognition["dc027s6_score"]+ cognition["dc027s7_score"]+ cognition["dc027s8_score"]+ \
  118. # cognition["dc027s9_score"]+cognition["dc027s10_score"]+\
  119. # cognition["draw_score"]
  120. # cognition.to_csv("/root/r_base/CHARLS/test.csv")
  121. # import pandas as pd
  122. # CHARLS_data = pd.read_csv("CHARLS_data_pollutants_p_n_m_nd_h.csv")
  123. # #合并
  124. # merge_list = ["marital_status_m", "Height_m", "Weight_m", "waist_m", "Systolic_m", "Diastolic_m",
  125. # "Physical_activity_m", "Smoke_m", 'Drink_m', 'Hypertension_m', 'Disabetes_or_High_Blood_Sugar_m',
  126. # 'Cancer_or_Malignant_Tumor_m', 'Chronic_Lung_Diseases_m', 'Heart_Problems_m', 'Emotional_Nervous_or_Psychiatric_Problems_m',
  127. # 'Stroke_m', 'Arthritis_or_Rheumatism_m', 'Dyslipidemia_m', 'Liver_Disease_m', 'Kidney_Diease_m', 'Stomach_or_Other_Digestive_Disease_m',
  128. # 'Asthma_m', 'Memory_Related_Disease_m', 'Psychiatric_score_m', 'sleep_state_m', 'Cognition_score_m']
  129. # # 遍历 merge_list 列表
  130. # for col_m in merge_list:
  131. # col = col_m.replace('_m', '') # 去掉 '_m' 得到相应的列名
  132. # if col in CHARLS_data.columns and col_m in CHARLS_data.columns:
  133. # CHARLS_data[col] = CHARLS_data[col_m].fillna(CHARLS_data[col])
  134. # CHARLS_data.to_csv("CHARLS_data_pollutants_p_n_m_nd_h_test.csv")
  135. import base64
  136. def audio_to_base64_string(file_path):
  137. # 打开语音文件并以二进制模式读取
  138. with open(file_path, "rb") as audio_file:
  139. audio_data = audio_file.read()
  140. # 对音频数据进行 Base64 编码
  141. encoded_audio = base64.b64encode(audio_data)
  142. # 将编码后的数据转换为字符串
  143. audio_string = encoded_audio.decode("utf-8")
  144. return audio_string
  145. # 示例调用
  146. file_path = "bjh_rec_0a8ed71a266044c0b72f50c1d3d6be15.wav" # 将此处替换为你的文件路径
  147. audio_string = audio_to_base64_string(file_path)
  148. # 将结果写入到文本文件中,以避免控制台显示乱码
  149. with open("encoded_audio_string.txt", 'w') as output_file:
  150. output_file.write(audio_string)