R in the Antipodes: 2011

Thursday, 29 December 2011

R: Nutrient intake data, mixed methods analysis for males

To recap:

we've cleaned the data (see earlier posts)
we're up to constructing the output data that will be used to estimate the population distribution values for the particular nutrient we are examining (energy)

The code below is my implementation of the mixtran sas macro for amount, where the nutrient is consumed daily. This means that a bunch of the macro code is irrelevant for the current analysis, as we don't need to estimate daily consumption probabilities, or apply them in the analysis, because probability of consumption = 1.

A boxcox analysis is performed to identify the best transformation to normality prior to undertaking the mixed methods analysis. The data contains age as single year, so I am interested in the effect of analysing age as both

age bands used for nutrient reference values - there are standard age bands used for the reference values, so it makes sense to create age factors using these bands and then analyse age as a factor based on these bands, and
a continuous variable, although because the method below is linear it assumes a linear effect of age on nutrient consumption and this is probably an unrealistic assumption

The R code is below. Code lines mentioned relate to the line in the mixtran macro linked above. Note that this method is using a linear mixed effects model on the data, the SAS macro uses a nonlinear mixed effects model, but I am currently unsuccessful in duplicating that exact model into R. The data that I have is much less complex from a covariates perspective compared to the data that was analysed in SAS, so at this point I am not too worried about the change in this part of the method.
#The data equivalency with the SAS macro data:
#SAS macro variable        SAS                R
#subject                seqn            RespondentID
#response                add_sug            IntakeAmt
#repeat                    drddaycd        IntakeDay
#covars_amt                {Age groups}    AgeFactor        Note: no dummies as AgeFactor is an ordered factor
#replicate_var            rndw1            SampleWeight    Note: used as a frequency variable ("replicate" in SAS), used as weight in R
#seq                    seq2            IntakeDay        Note: no dummy as IntakeDay is an ordered factor
#                        indwt            SampleWeight

#ANALYSIS OF MALE DATA - STEP 1 OF 2
#MIXTRAN
#Delete any unused RespondentID levels that are related to the female subjects
Male.Data$RespondentID <- Male.Data$RespondentID[,drop=TRUE]
#row 419 reduce the data to one record per person to calculate weights
Male.Subset <- subset(Male.Data,!duplicated(RespondentID), select=c(RespondentID, SampleWeight, IntakeAmt, AgeFactor))
#row 446 add the total weights to the data frame
Male.Subset$TotWts <- sum(Male.Subset$SampleWeight)
#row 455 create the subgroup weights, these are all by age only
Male.Grpwt <- aggregate(Male.Subset$SampleWeight,by= data.frame(Male.Subset$AgeFactor),sum)
names(Male.Grpwt)[names(Male.Grpwt)=="Male.Subset.AgeFactor"] <- "AgeFactor"
names(Male.Grpwt)[names(Male.Grpwt)=="x"] <- "GrpWts"
Male.Subset <- merge(Male.Subset,Male.Grpwt, by=c("AgeFactor"))
#row 472 get the number of persons in total and by subgroup if given
#the SAS macro produces a dataset of 1 row per agegroup, with agegrp, num subjects in age group, total num subjects output
count <- function(x) {
length(na.omit(x))
}
Numsub <- aggregate(Male.Subset$RespondentID,by=data.frame(Male.Subset$AgeFactor),count)
names(Numsub)[names(Numsub)=="Male.Subset.AgeFactor"] <- "AgeFactor"
names(Numsub)[names(Numsub)=="x"] <- "NumSubjects"
Numsub$TotSubjects <- sum(Numsub$NumSubjects)
#row 492 merge numsub to _persons
Male.Subset <- merge(Male.Subset,Numsub, by=c("AgeFactor"))
Male.Subset <- Male.Subset[order(Male.Subset$RespondentID),]
#END OF GENERAL SET UP
#convert proc transreg, from row 803
#NOTE: CODE ONLY USED TO IDENTIFY THE LAMBDA VALUE BETWEEN 0.05 AND 1 ASSOCIATED WITH THE MAXIMUM LOG LIKELIHOOD
#fitting a linear model with a BoxCox transformation, on the data that includes replicates
library(MASS)
Male.Boxcox <- as.data.frame(with(Male.Data, {
    boxcox(IntakeAmt ~ AgeFactor + IntakeDay,
    lambda= seq(0.05, 1, 0.05), plotit=FALSE)
    }))
#locate the row containing the largest log likelihood
which.max(Male.Boxcox$y)
Lambda.Value <- Male.Boxcox[Male.Boxcox$y == max(Male.Boxcox$y),1 ]
#row 818 add boxcox values to Male.Data
Male.Data$BoxCoxXY <- (Male.Data$IntakeAmt^Lambda.Value-1)/Lambda.Value
#clean up the R workspace by deleting data frames no longer required
objects()
rm(Imported.Data, Long.Data, Male.Boxcox, Male.Grpwt, Numsub)
#row 840 through nlmixed starting at row 996, do all the data preparation in one hit
#working through nlmixed lines now, prework on variable construction
#not sure why subject is fit as a nonlinear effect, fit using a linear method
#use a model with an intercept for fixed effects
library(lme4)
Male.lme1 <- lmer(BoxCoxXY ~ AgeFactor + IntakeDay + (1|RespondentID),
            data = Male.Data,
            weights = SampleWeight)
summary(Male.lme1)
plot(fitted(Male.lme1), residuals(Male.lme1))
Male.Data$lmefits <- fitted(Male.lme1)
#Compare with using age as a continuous variable
Male.Boxcox.Age <- as.data.frame(with(Male.Data, {
    boxcox(IntakeAmt ~ Age + IntakeDay,
    lambda= seq(0.05, 1, 0.05), plotit=FALSE)
    }))
which.max(Male.Boxcox.Age$y)
Lambda.Value.Age <- Male.Boxcox.Age[Male.Boxcox.Age$y == max(Male.Boxcox.Age$y),1 ]
Male.Data.Age$BoxCoxXY <- (Male.Data.Age$IntakeAmt^Lambda.Value.Age-1)/Lambda.Value.Age
#Age as continuous fixed effect
Male.lme2 <- lmer(BoxCoxXY ~ Age + IntakeDay + (1|RespondentID),
            data = Male.Data,
            weights = SampleWeight)
summary(Male.lme2)
plot(fitted(Male.lme2), residuals(Male.lme2))
Male.Data$Agefits <- fitted(Male.lme2)

Some highlights for me:

The same transformation was identified as the best for both analyses.
An ANOVA comparison of the two models showed no significant improvement from using age as a continuous variable, so the age band method is retained for subsequent analyses:

anova(Male.lme1,Male.lme2)
Data: Male.Data
Models:
Male.lme2: BoxCoxXY ~ Age + IntakeDay + (1 | RespondentID)
Male.lme1: BoxCoxXY ~ AgeFactor + IntakeDay + (1 | RespondentID)
          Df    AIC     BIC logLik Chisq Chi Df Pr(>Chisq)
Male.lme2 5 9894.4 9926.4 -4942.2
Male.lme1 7 9966.2 10011.1 -4976.1     0      2          1

The residuals plot for the age band analysis is reasonably nicely behaved given there are >4000 data points, although there could be a slight upwards trend:

The output from the age band lmer analysis is:

Linear mixed model fit by REML

Formula: BoxCoxXY ~ AgeFactor + IntakeDay + (1 | RespondentID)

Data: Male.Data

AIC BIC logLik deviance REMLdev

9994 10039 -4990 9952 9980

Random effects:

Groups Name Variance Std.Dev.

RespondentID (Intercept) 0.19408 0.44055

Residual 0.37491 0.61230

Number of obs: 4498, groups: RespondentID, 2249

Fixed effects:

Estimate Std. Error t value

(Intercept) 13.98016 0.03405 410.6

AgeFactor4to8 0.50572 0.04084 12.4

AgeFactor9to13 0.94329 0.04159 22.7

AgeFactor14to18 1.30654 0.04312 30.3

IntakeDayDay2Intake -0.13871 0.01809 -7.7

Correlation of Fixed Effects:

(Intr) AgFc48 AgF913 AF1418

AgeFactr4t8 -0.775

AgeFctr9t13 -0.761 0.634

AgFctr14t18 -0.734 0.612 0.601

IntkDyDy2In -0.266 0.000 0.000 0.000

Wednesday, 28 December 2011

Nutrient intake data, finalising the data in R

I run plain R in the normal gui under Windows 7, which means no bells and whistles. This means that I find the R gui somewhat awkward to program in. Thanks to advice I received a number of years ago, I use Notepad ++ as my programming environment. It has line numbering, and when you use Language > R through the menu to set the programming language, you get colour coded syntax. It also has the nice feature of emphasizing the current bracket set that you are using, which makes it very easy to see whether you have remembered to close all your brackets - it counts backward from the last open bracket.

We're finally in R. :) The code below sets up the data sets for nutrient intake analysis, which will be the subject of my next posts. If you're following along in the SAS macro, the code is the R substitute for the data preparation in the starting macro called "example1_amount_mixtran_distrib.sas" from the Example 1 zip file which is downloadable from this webpage if you don't want to download the zip immediately.

SAS syntax files, which are identifiable by the .sas as a file extension, can be viewed with any text reader, and I use Notepad++ for that as well. If you open that SAS syntax file, the code below prepares the data for analysis in the mixtran macro, basically down to line 114.

You'll notice that I comment my code a lot, probably more than most. That is because I have had numerous experiences of coming back to code I wrote 6 months, or a couple of years earlier, and needing to revise it. I have found that what is obvious at the time of programming may not be so obvious as time passes and other programming projects have been completed.

You'll see the use of the reshape2 package. The data is basically a repeated measures design, as there are two 24-hour recall periods for nutrient intake per person. The data coming in from the .csv file constructed earlier has one row per person, with the nutrient intakes as two variables. For repeated measures, the data analysis later requires one row per intake (i.e. two rows per person). As this is the main data preparation stage, it makes sense to reform the data frame now.

While I cannot supply the data at this point, I will post the header() result before the melt so you can see the type of data in the data frame.

#This section of code duplicates the SAS code from example1_amount_mixtran_distrib.sas from line 1 through line 114
#Read in the Australian energy data
Imported.Data <- read.csv("foo.csv",header=T)
#check that headers have imported fine
names(Imported.Data)
length(Imported.Data)
nrow(Imported.Data)
#sort the data frame by subject
Imported.Data <- Imported.Data[order(Imported.Data$RespondentID),]#check sort worked, look at first few observations
head(Imported.Data)
#melt data frame so that each repeated measure (intake) is one row, and
#create factor to indicate whether it's a day1 or day2 intake.
#remember that reshape2 package must be installed at this point
library(reshape2)
Long.Data <- melt(Imported.Data, id=1:6, variable="IntakeDay",
measured=c("Day1Intake", "Day2Intake"))
names(Long.Data)[names(Long.Data)=="value"]<-"IntakeAmt"
#construct age group factors, lowest age group number = youngest age group
#age groups for analysis are set here (latest edition): http://www.nhmrc.gov.au/guidelines/publications/n35-n36-n37
#ASSUMPTION: no children <1 year old in data
#construct one variable that contains all the age factors
#evaluate from lowest to highest age, evaluation stops when condition is first met
#evaluate from lowest to highest age, evaluation stops when condition is first met
Long.Data$AgeF <-ifelse(Long.Data$Age<=3,1, ifelse(Long.Data$Age<=8,2, ifelse(Long.Data$Age<=13,3,
   ifelse(Long.Data$Age<=18,4, ifelse(Long.Data$Age<=30,5, ifelse(Long.Data$Age<=50,6,
   ifelse(Long.Data$Age<=70,7, ifelse(Long.Data$Age>70,8,""))))))))
Long.Data$AgeFactor <- as.factor(Long.Data$AgeF)
levels(Long.Data$AgeFactor) <- c("1to3","4to8","9to13","14to18","19to30","31to50","51to70","71Plus")
table(Long.Data$AgeF, Long.Data$AgeFactor)#Delete AgeF and any unused AgeFactor levels
Long.Data$AgeF <- NULL
Long.Data$AgeFactor <- Long.Data$AgeFactor[,drop=TRUE]
#Make RespondentID into a factor, it should not be treated as numeric
Long.Data$RespondentID <- as.factor(Long.Data$RespondentID)
#males and females are analysed separately, do not need to be specified as factors,
#construct different data frames for each - the code will duplicate the analysis for the second gender
#ASSUMPTION: males = 1 and females = 2
Male.Data <- subset(Long.Data, Gender==1)
Female.Data <- subset(Long.Data, Gender==2)

The result from head(Long.Data) is:
NutrientID RespondentID Gender Age BodyWeight SampleWeight Day1Intake Day2Intake
1        267       100013      2 15       59.4    0.3335521   8591.535   8747.908
2        267       100020      1 12       51.6    0.4952835 12145.852 13495.798
3        267       100050      2 15       62.1    0.3335521 14202.496 13724.582
4        267       100100      2   4       18.5    0.3563699   8621.690   6218.391
5        267       100128      2   2       13.2    0.1666111   5140.690   6427.673
6        267       100370      2   7       24.9    0.3563699   7418.029 13620.542

Nutrients consumed daily, R analysis story board

With any relatively complicated programming task, I prefer to first create a story board that provides the steps I will take to complete the overall task. This may end up being a heavily edited post if I find that my plans change part way through.

The point of this analysis is to produce a cleaned distribution of nutrient intakes for the Australian population, using two 24-hour intake recall periods. The general method being followed is from here where the method has been implemented for SAS. I was a SAS programmer for around 12 years, but never had much to do with macros as all my programming tasks were one-offs. This has meant that understanding the SAS code has taken me a reasonable period of time.

The approach will use the data sets constructed in the previous blog post, which I cleaned in Excel using VBA.

The main steps are:

Identify a test data set (in this case, energy (KJ) intake),
Reprogram the SAS code into R, hard coding in the variable names from the cleaned data set as these will be standard for the data analysis of the various nutrients,
Compare the output distribution to that obtained by the previous method of analysis, then if all goes well
Output the cleaned distribution to a .csv file for subsequent use by the client, and then finally
Automate the process for the client, e.g. include GUI features for the client to select the input .csv data set, so the user does not need to change any of the R code.

The method will use a number of R packages as well as the standard installation. For example,

the MASS package is used to Box Cox transform the data to normality,
the reshape2 package is required to melt the data so that repeated measures are separate observations, not separate variables - this will double the length of the data set as all observations have two 24-hour recall periods, and
mixed effects analysis is being undertaken using lme4.

I have been working with two of the SAS macros for the past couple of months, and the R code will be dramatically shorter compared to the SAS code as there will be no interim data sets output. Because this process is only addressing nutrients consumed daily (rather than those consumed episodically, e.g. alcohol, or for foods rather than nutrients), the SAS code is simplified into R by not having to generate probability distributions for intake. Once this R analysis has been implemented, I will rework it for the episodic case. To ensure that the correct R program is used, I will incorporate a "missing value check" to ensure that the correct program is used. For the "consumed daily" nutrients, the data set contains only observations with consumption so by definition there is no missing data - but it is a good idea to check all assumptions.

Along the way, I will be doing some additional testing. The NCI method linked above uses a number of covariates, such as age group. The client has been analysing the data (not using the NCI method) separately by age group. I will be testing the effect on the overall intake distribution by:

analysing separately for age group (3 age groups), versus
using age group as fixed effect covariates, versus
using age-in-years as a continuous fixed effect variable instead of age group covariates (there are >4000 observations so there are multiple observations per age-in-years)

Tuesday, 27 December 2011

Data preparation for analysis, use VBA

Data on daily nutrient intake information is stored within a database, and the users are extracting the relevant data on a nutrient-by-nutrient basis through an export procedure. The exported data is being stored in Excel 2010 data sets, one for each nutrient. There is a set of variables being saved in each export, as well as an optional variable that does not need passing to R, but - when it is present - is situated between subsets of relevant variables. The names of the variables are not exactly the same between the Excel data sets, nor is the starting row for the variable names, and a couple of variable names are saved across two rows instead of one. This creates a bit of a headache for creating a .csv file to import into R.

For one file, the starting rows look like:

Energy, including dietary fibre. 2007 ANCNPAS. Including supps.

nutrient_code	Respondent ID	Gender	Age	Body weight	samplewt	Intake- day1	Intake- day2	Units
267	100013	2	15	59.4	0.333552084	8591.5354	8747.9084	KJ
267	100020	1	12	51.6	0.495283471	12145.8524	13495.798	KJ

For another file, the starting rows look like:

iodine
with supplements
							day1	day 2
nut_code	id	sex	age	Body weight	seifa	samplewt	Intake	Intake	Units	Upper safe level (UL)
315	100013	2	15	59.4		0.33355208	103.7881	128.07576	ug	900
315	100020	1	12	51.6		0.49528347	140.46202	218.31528	ug	600
315	100050	2	15	62.1		0.33355208	118.21546	184.33722	ug	900

Because there are multiple data sets that will need conversion to R, the best method is to construct some VBA code in an Excel workbook to handle the data cleaning and preparation. The saving grace is that the relative order of the variables appears to be the same across the different data sets.

The code below cleans both files using a button click event in Excel 2010. The cleaned data is saved as a .csv file, using the same file name as the input .xlsx file and into the same directory. The Respondent IDs are integer, but must be dimmed as Long because they exceed the maximum value that can be stored in an Integer type. The variables are defined as arrays, and then redimmed to the correct length just before they have values allocated, so the method will work with data sets of varying numbers of observations. For those unfamiliar with VBA, the ' is used to comment code.

The Open file has been restricted to only Excel 2010 workbooks - this should save the user from scrolling through irrelevant non-Excel files. Because we have constructed the .csv filename to have the correct file extension, we just need to put in the file name at the end but we do need to amend the type.

Sub Button1_Click()
    Dim FilePath As String
    Dim FileName As String
    Dim CSVFile As String
    Dim WB As Workbook
    Dim StartRow As Range
    Dim Count As Integer                    'Used to store the size of the array for each variable, can only be a whole number
    Dim i As Integer                        'Count in the values to each array
    Dim NutrientCode() As Integer           'Numeric code for nutrient, same value for all rows in the same sheet
    Dim Respondent() As Long                'Respondent IDs are larger than 32K so need to make these long rather than integer
    Dim Gender() As Integer                 'Sex, codes are 1 or 2
    Dim Age() As Integer
    Dim BodyWeight() As Double
    Dim SampleWeight() As Double
    Dim Day1Intake() As Double
    Dim Day2Intake() As Double

    On Error GoTo Err_Clr

    'From http://msdn.microsoft.com/en-us/library/ff834966.aspx
    'Get workbook name
    FilePath = Application _
        .GetOpenFilename("Excel Files (*.xlsx), *.xlsx")
        If FilePath <> False Then
        'MsgBox "Open " & FilePath
        End If

    'Save filename and path for output to csv
    FileName = Dir(FilePath)
    'MsgBox "The filename is " & FileName
    CSVFile = Left(FileName, Len(FileName) - 4) & "csv"
    'MsgBox "The filename is " & CSVFile

    'Open workbook
    Set WB = Workbooks.Open(FilePath)
    WB.Activate

    'Locate variable name row
    Range("A1").Select
    Do Until (ActiveCell.Value = "nutrient_code") Or (ActiveCell.Value = "nut_code")
        ActiveCell.Offset(1, 0).Select
        Loop
    Set StartRow = ActiveCell
    'MsgBox "Startrow is with cell " & StartRow
    'Get Count value
    Count = 0
    Do While ActiveCell.Value <> ""
        ActiveCell.Offset(1, 0).Select
        Count = Count + 1
        Loop
    'Count is 1 too high due to looping to empty row, reduce by 1
    Count = Count - 1
    'MsgBox "The number of observations is " & Count
    'Go back to variable name row
    StartRow.Select
    'Nutrient intakes
    ReDim NutrientCode(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        NutrientCode(i) = ActiveCell.Value
        Next
    'Respondent IDs
    StartRow.Select
    ActiveCell.Offset(0, 1).Select
    ReDim Respondent(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        Respondent(i) = ActiveCell.Value
        Next
    'Gender
    StartRow.Select
    ActiveCell.Offset(0, 2).Select
    ReDim Gender(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        Gender(i) = ActiveCell.Value
        Next
    'Age
    StartRow.Select
    ActiveCell.Offset(0, 3).Select
    ReDim Age(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        Age(i) = ActiveCell.Value
        Next
    'Body Weight
    StartRow.Select
    ActiveCell.Offset(0, 4).Select
    ReDim BodyWeight(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        BodyWeight(i) = ActiveCell.Value
        Next
    'need to check for seifa column at this point
    'Sample Weight
    StartRow.Select
    ActiveCell.Offset(0, 5).Select
    If ActiveCell.Value = "seifa" Then ActiveCell.Offset(0, 1).Select
    ReDim SampleWeight(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        SampleWeight(i) = ActiveCell.Value
        Next
    'Day 1 intake
    StartRow.Select
    ActiveCell.Offset(0, 5).Select
    If ActiveCell.Value = "seifa" Then
        ActiveCell.Offset(0, 2).Select
        Else
            ActiveCell.Offset(0, 1).Select
        End If
    ReDim Day1Intake(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        Day1Intake(i) = ActiveCell.Value
        Next
    'Day 2 intake
    StartRow.Select
    ActiveCell.Offset(0, 5).Select
    If ActiveCell.Value = "seifa" Then
        ActiveCell.Offset(0, 3).Select
        Else
            ActiveCell.Offset(0, 2).Select
        End If
    ReDim Day2Intake(Count)
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        Day2Intake(i) = ActiveCell.Value
        Next

    'Save all the array values to a new workbook
    Workbooks.Add
    'Output Nutrient ID
    ActiveCell.Value = "NutrientID"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = NutrientCode(i)
        Next
    'Output Respondent
    Range("B1").Select
    ActiveCell.Value = "RespondentID"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = Respondent(i)
        Next
    'Output Gender
    Range("C1").Select
    ActiveCell.Value = "Gender"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = Gender(i)
        Next
    'Output Age
    Range("D1").Select
    ActiveCell.Value = "Age"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = Age(i)
        Next
   'Output Body Weight
    Range("E1").Select
    ActiveCell.Value = "BodyWeight"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = BodyWeight(i)
        Next
    'Output SampleWeight
    Range("F1").Select
    ActiveCell.Value = "SampleWeight"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = SampleWeight(i)
        Next
    'Output Day 1 Intake
    Range("G1").Select
    ActiveCell.Value = "Day1Intake"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = Day1Intake(i)
        Next
    'Output Day 2 Intake
    Range("H1").Select
    ActiveCell.Value = "Day2Intake"
    For i = 1 To Count
        ActiveCell.Offset(1, 0).Select
        ActiveCell.Value = Day2Intake(i)
        Next

    'Save the worksheet as a csv file for R
    ActiveWorkbook.SaveAs FileName:=CSVFile, FileFormat:=xlCSV

Err_Clr:
    If Err <> 0 Then
        Err.Clear
        Resume Next
        End If


End Sub