当前位置:首页|资讯|编程|GPT-4|Midjourney

CHARLS数据库常用变量提取代码

作者:熊大学习社发布时间:2024-06-05

CHARLS数据库常用变量提取代码

诺维医学科研官网:https://www.newboat.top 更新换版中!

B站:文章对应的讲解视频在此。熊大学习社 https://space.bilibili.com/475774512

微信公众号|Bilibili|抖音|小红书|知乎|全网同名:熊大学习社

公益网站,https://nwzz.xyz/ ,内有医学离线数据库、数据提取、科研神器等高质量资料库

诺维之舟AI:https://gpt4.nwzz.xyz 可在线使用GPT4|GPT3.5|Midjourney

课程相关资料:

(1)课程资料包括CHARLS数据库常用变量提取代码讲义。关注公众号“熊大学习社”,回复“charls2406”,获取资料。

我们坚持学以致用,做有质量的分享。关注熊大学习社,您的一键三连是我最大的动力。

(2)一对一论文指导学员免费获取学习课程和专属答疑。了解咨询扫客服二维码。

年龄 age/ 性别 sex / 教育 education/ 婚姻 marriage/ 住所 residence

baseline_demo<-read_dta('2015/Demographic_Background.dta')baseline_demo %>%  transmute(ID,                              age= 2015-ba004_w3_1,                             gender=ifelse(ba000_w2_3==1,'Male','Female'),                             education=ifelse(bd001_w2_4>=6,'high school+',                                               ifelse(bd001_w2_4==1, 'Illiterate',                                                     ifelse(bd001_w2_4 %in% c(2,3,4), 'primary','middle school'))),                             marital = ifelse(be001 %in% c(1,2), 'married','single'),                             residence = ifelse(bb001_w3_2 %in% c(1,2),'urban','rural')                            )

read_dta('2015/Biomarker.dta')%>%mutate(height = ifelse(qi002>300|qi002<100, NA, qi002),                                                  weight = ifelse(ql002>300, NA, ql002),                                                  bmi = weight/(height/100)^2)# 肥胖分类obesity = ifelse(bmi>=24,'Yes','No')# 腰围waist = ifelse(qm002>=500,NA,qm002)

# 第一种方法,最大值read_dta('2015/Biomarker.dta')%>%mutate(g1 = ifelse(qc003>300, NA, qc003),                                                  g2 = ifelse(qc004>300, NA, qc004),                                                  g3 = ifelse(qc005>300, NA, qc005),                                                  g4 = ifelse(qc005>300, NA, qc005),                                                  grip = pmax(g1,g2,g3,g4,na.rm = T),                                                  )# 第二种方法,均值grip_strength = rowMeans(select(., g1:g4), na.rm = TRUE)

# 第一种方式read_dta('2015/health_status_and_functioning.dta') %>% mutate(    smoking = ifelse(da059==1, 'yes','no' ))# 第二种方式read_dta('2015/health_status_and_functioning.dta') %>% mutate(    smoking = ifelse(da059==2,'Non-smoker', ifelse(da061==2,'Ex-smoker','Smoker')))# 更多信息read_dta('2015/health_status_and_functioning.dta') %>% mutate(    # 吸烟分类    smoking = ifelse(da059==2,'Non-smoker', ifelse(da061==2,'Ex-smoker','Smoker')),    # 吸烟时间    da065_1,    # 吸烟年份    da065_2,    # 吸烟时间    smoking_duration = ifelse(!is.na(da065_1),age-da065_1,2011-da065_2),    # 吸烟时间分类    smoking_duration_cat = cut(smoking_duration, breaks = c(-Inf, 19, 39, Inf),                                labels = c("<20", "20-39", ">=40")),    # 吸烟量    smoking_num = da063,    # 吸烟量分类    smoking_num_cat = cut(smoking_num, breaks = c(-Inf, 19, 39, Inf),                           labels = c("<20", "20-39", ">=40")),    # 开始吸烟年龄    age_str_smk = ifelse(!is.na(da065_1), da065_1, age-(2011-da065_2)),    # 开始吸烟年龄分组    age_str_smk_cat = cut(age_str_smk, breaks = c(-Inf, 12, 17,25, Inf),                           labels = c("<13", "13-17",'18-25', ">=25")),    # 结束吸烟年龄    age_quit_smk = ifelse(!is.na(da062_1), da062_1, age-(2011-da062_2)),    # 吸烟累计包数:吸烟时间*吸烟量/20根一包    smoking_pac_yr = ifelse(smoking=='Non-smoker',0, smoking_duration*da063/20),    # 吸烟包数分类    smoking_pac_yr_cat = cut(smoking_pac_yr, breaks=c(-Inf,0,14,29,Inf),                              labels = c('0','<15','15-30','>=30')),    # 结束吸烟分组    age_quit_smk_cat = case_when(      age_quit_smk < 40 ~ "<40",      age_quit_smk < 50 ~ "<50",      age_quit_smk < 60 ~ "<60",      age_quit_smk >= 60 ~ '>=60')    )

read_dta('2015/health_status_and_functioning.dta') %>% mutate(drinking = ifelse(da067==1,'yes','no'))

functional limitation [activities of daily living (ADL),功能限制[日常生活活动

instrumental activities of daily living (IADL),日常生活工具活动

# 第一种方法,积分read_dta('2015/Health_Status_and_Functioning.dta')%>%mutate(Func_limt = db010+db011+db012+db013+db014+db015+db016+db017+db018+db019+db020)# ADLADL_disability = ifelse(db010==4|db011==4|db012==4|db013==4|db014==4|db015==4,1,0),# IADLIADL_disability = ifelse(db016==4|db017==4|db018==4|db019==4|db020==4,1,0),

health_status_and_functioning提取。

# 第一种方法depression = (dc009-1)+(dc010-1)+(dc011-1)+(dc012-1)+(dc014-1)+(dc015-1)+(dc017-1)+(dc018-1)# 第二种方法CESD = as.numeric(dc009)+as.numeric(dc010)+  as.numeric(dc011)+as.numeric(dc012)+(-as.numeric(dc013)+5)+  as.numeric(dc014)+as.numeric(dc015)+(-as.numeric(dc016)+5)+  as.numeric(dc017)+as.numeric(dc018)-10,depression = ifelse(CESD>=10,'Y','N'),

Health_Status_and_Functioning中提取。

health = case_when(  da001 %in% c(1,2,3)|da002 %in% c(1,2)~'Good',  da001==4|da002==3 ~'Fair',  da001==5|da002 %in% c(4,5)~'Poor')

包括身体虚弱、消耗、体重减轻等,从Health_Status_and_Functioning中提取。

weakness=ifelse(db008==1,0,1),snowness = ifelse((db003==1&db005==1)|is.na(db003)|is.na(db005),0,1),exhaustion = ifelse(dc018 %in% c(3,4)|dc012 %in% c(3,4),1,0),lowpa = ifelse(da051_2_==2|da051_3_==2,1,0),weightloss = ifelse(da047==2,1,0)sum_vars = rowSums(select(., weakness, snowness, exhaustion, lowpa, weightloss), na.rm = TRUE),Frailty = ifelse(sum_vars>=3,1,0),

14个,Health_Status_and_Functioning中提取。

# 全部提取read_dta('2015/Health_Status_and_Functioning.dta')%>%mutate(chronic = rowSums(select(., starts_with("zda007_")), na.rm = TRUE))# 分类提取# 心血管疾病,心脏病和中风CVD = ifelse(da007_8_==1|da007_7_==1,'Yes','No')# 心脏病Heartfailure = ifelse(da007_7_==1,'Yes','No')# 中风Stroke = ifelse(da007_8_==1,'Yes','No')# 逐个提取HTN = case_when(  zda007_1_==1|da007_1_==1|da007_w2_2_1_==1~'Y',  TRUE~'N'),CA = case_when(  zda007_4_==1|da007_4_==1|da007_w2_2_4_==1~'Y',  TRUE~'N'),CLD = case_when(  zda007_5_==1|da007_5_==1|da007_w2_2_5_==1~'Y',  TRUE~'N'),HD = case_when(  zda007_7_==1|da007_7_==1|da007_w2_2_7_==1~'Y',  TRUE~'N'),STK = case_when(  zda007_8_==1|da007_8_==1|da007_w2_2_8_==1~'Y',  TRUE~'N'),MentalD = case_when(  zda007_11_==1|da007_11_==1|da007_w2_2_11_==1~'Y',  TRUE~'N'),AR = case_when(  zda007_13_==1|da007_13_==1|da007_w2_2_13_==1~'Y',  TRUE~'N'),DL = case_when(  zda007_2_==1|da007_2_==1|da007_w2_2_2_==1~'Y',  TRUE~'N'),LiverD = case_when(  zda007_6_==1|da007_6_==1|da007_w2_2_6_==1~'Y',  TRUE~'N'),CKD = case_when(  zda007_9_==1|da007_9_==1|da007_w2_2_9_==1~'Y',  TRUE~'N'),DigestD = case_when(  zda007_10_==1|da007_10_==1|da007_w2_2_10_==1~'Y',  TRUE~'N'),Asthma = case_when(  zda007_14_==1|da007_14_==1|da007_w2_2_14_==1~'Y',  TRUE~'N')

health_status_and_functioningbiomarkers两个表提取。

# 收缩压1sbp1 = ifelse(qa003>300,NA,qa003),# 收缩压2sbp2 = ifelse(qa007>300,NA,qa007),# 收缩压3sbp3 = ifelse(qa011>300,NA,qa011),# 舒张压1dbp1 = ifelse(qa004>200,NA,qa004),# 舒张压2dbp2 = ifelse(qa008>200,NA,qa008),# 舒张压3dbp3 = ifelse(qa012>200,NA,qa012),   # 收缩压,均值sbp=(sbp1+sbp2+sbp3)/3,# 舒张压,均值dbp=(dbp1+dbp2+dbp3)/3,# 高血压,HypertensionHypertension = ifelse((sbp<140&dbp<90)&(is.na(da007_1_)|da007_1_!=1)&                      (is.na(da011s1)|(da011s1!=1))&(is.na(da011s2)|da011s2!=1),'No','Yes')

Health_status_and_functioningBlood两个表提取。

# 糖尿病,DiabetesDiabetes = ifelse((newhba1c<6.5)&(is.na(da007_3_)|da007_3_!=1)&(is.na(da014s1)|da014s1!=1)&              (is.na(da014s2)|da014s2!=1),'No','Yes')

Blood两个表提取。

# 血脂症,DyslipidemiaDyslipidemia = ifelse(newcho>=6.19*38.6|newtg>=2.27*88.5|newldl>=4.14*38.6,'Yes','No')

Demographic_BackgroundExit_Interview提取。

# 全因死亡all_cause_mortality = ifelse(!is.na(exb001_1), 1, ifelse(!is.na(eligible),0,NA)),# 生存时间follow_up_time = ifelse(all_cause_mortality==1, exb001_1-2011,2),# 死亡年龄death_age = exb001_1-ba002_1,# 是否早亡premature_death = ifelse((all_cause_mortality==1&(gender=='Male')  &(death_age<72.7))|                        ( all_cause_mortality==1&(gender=='Female')&(death_age<76.9)),1,0)

Health_status_and_functioning提取。

socialactivity = case_when(da056s12==11~'N',TRUE~'Y')

Health_status_and_functioning提取。

sleepquality = factor(dc015,levels=c(1,2,3,4),                      labels=c('Rarely or none of the time',                               'Some or a little of the time',                               'Occasionally or a moderate amount of the time',                               'Most or all of the time'))# 睡眠时间,2013年数据sleephr = da049

Health_status_and_functioning提取。

# 第一种方法life_satisfy = ifelse(dc028 %in% 1:2,'Good', ifelse(dc028 %in% 3:4,'Fair','Poor')),# 第二种方法life_satisfy = (-as.numeric(dc028))+5,

Health_status_and_functioning提取。

# 2013年数据hearing = ifelse(da039 %in% 1:3,'Good',                 ifelse(da039==4,'Fair','Poor')),vision = ifelse(da033 %in% 1:3|da034 %in% 1:3,'Good',                 ifelse(da039==4|da034 %in% 1:3,'Fair','Poor')),

Health_status_and_functioning提取。

# 2013年数据pain = ifelse(rowSums(is.na(select(., starts_with("da042s"))))==15,0,1),

Blood两个表提取。

# 葡萄糖glucose = newglu,# 甘油三脂TG = newtg*0.011,

**VAI ,内脏脂肪指数(VAI)**由人体学测量指标和血脂指标组成。计算方法如图

CVAI = ifelse(gender=='Male',              -267.93+0.68*age+0.03*BMI+4*WC+22*log10(TG)-16.32*hdl,              -187.32+1.71*age+4.23*BMI+1.12*WC+39.76*log10(TG)-11.66*hdl)

# 第一种方法,2018年数据congnition<-read_dta('2018/Cognition.dta') %>% mutate(    mental = (dc001_w4==1)+(dc002_w4==1)+(dc003_w4==1)+(dc005_w4==1)+(dc006_w4==1)+              (dc014_w4_1_1==93)+(dc014_w4_2_1==86)+(dc014_w4_3_1==79)+(dc014_w4_4_1==72)+             (dc014_w4_5_1==65)+(dc024_w4==1),    immediate = ((dc028_w4_s1==1)+(dc028_w4_s2==2)+(dc028_w4_s3==3)+(dc028_w4_s4==4)+                 (dc028_w4_s5==5)+(dc028_w4_s6==6)+(dc028_w4_s7==7)+(dc028_w4_s8==8)+                 (dc028_w4_s9==9)+(dc028_w4_s10==10)+(dc029_w4_s1==1)+(dc029_w4_s2==2)+                 (dc029_w4_s3==3)+(dc029_w4_s4==4)+(dc029_w4_s5==5)+(dc029_w4_s6==6)+                 (dc029_w4_s7==7)+(dc029_w4_s8==8)+(dc029_w4_s9==9)+(dc029_w4_s10==10))/2,    delayed =(dc047_w4_s1==1)+(dc047_w4_s2==2)+(dc047_w4_s3==3)+(dc047_w4_s4==4)+             (dc047_w4_s5==5)+(dc047_w4_s6==6)+(dc047_w4_s7==7)+(dc047_w4_s8==8)+             (dc047_w4_s9==9)+(dc047_w4_s10==10),    recall = (immediate+delayed)/2,    follow_up_cognition = recall+mental)# 第二种方法,2013年数据VS = ifelse(dc025==1,1,0),delay = (dc027s1 %in% 1)+(dc027s2%in%2)+(dc027s3%in%3)+               (dc027s4%in%4)+(dc027s5%in%5)+(dc027s6%in%6)+(dc027s7%in%7)+               (dc027s8%in%8)+(dc027s9%in%9)+(dc027s10%in%10),immediate =(dc006_1_s1%in%1)+(dc006_1_s2%in%2)+(dc006_1_s3%in%3)+  (dc006_1_s4%in%4)+(dc006_1_s5%in%5)+(dc006_1_s6%in%6)+(dc006_1_s7%in%7)+  (dc006_1_s8%in%8)+(dc006_1_s9%in%9)+(dc006_1_s10%in%10),memory = (immediate+delay)/2,orientation = (dc001s1%in%1)+(dc001s2%in%2)+(dc001s3%in%3)+(dc002%in%1)+  (dc003%in%1),  attention = (dc019%in%93)+(dc020%in%86)+(dc021%in%79)+(dc022%in%72)+    (dc023%in%65),cognition = memory+orientation+attention+VS,

小结

课程相关资料:

(1)课程资料包括CHARLS数据库常用变量提取代码讲义。关注公众号“熊大学习社”,回复“charls2406”,获取资料。

我们坚持学以致用,做有质量的分享。关注熊大学习社,您的一键三连是我最大的动力。

(2)一对一专属论文指导,一篇SCI指导直到录用,具体扫码咨询助理老师。