########################
# GETTING STARTED WITH R
########################

# If necessary, download and install R from http://cran.r-project.org.
# Start R. If you have never installed the lme4 package, or need to update it,
# run the following command:

install.packages("lme4", dependencies = TRUE)

# To run commands from a script like this, you can either cut-and-paste into the
# R Console window, or with your cursor on the desired line (or having highlighted
# multiple lines), press Ctrl-R (Windows) or Command-Enter (Mac).

# Each time you start R, you have to load (not install) the packages you want to use.
# This is done with the library() function. So to load lme4 (mixed-effects models):

library(lme4)

#############################################
# PART ONE: DATA VISUALIZATION AND REGRESSION
#############################################

# Fisher's iris data set (http://en.wikipedia.org/wiki/Iris_flower_data_set) is one
# of the "famous" data sets that are included with R. To load this data, you use
# the data() function.

data(iris)

# This command creates an object called "iris". To check, list the workspace:

ls()
class(iris) # data.frame

# The "iris" object is a data frame, which is the format that all data should be in
# to carry out regression analysis. Data frames have rows and columns. Each row is
# one observation, and each column represents a different variable. The difference
# between response and predictor variables is not indicated. The head() command is
# a good way to see the structure of the data. It lists the first six rows.

head(iris)

# We can make a few plots to see the relationships between variables. It is possible
# to customize almost everything about an R plot, but this is not always easy.
# We will stick to very simple plots. Many people use the ggplot2 package for plots.

plot(Sepal.Width ~ Sepal.Length, data = iris)

# This plot would be more informative if we colored the symbols by species...

plot(Sepal.Width ~ Sepal.Length, col = Species, data = iris)

# ...and added a primitive legend.

text(6.75, c(4.0, 4.1, 4.2), paste("Iris", levels(iris$Species)), col = 1:3)

# Two species overlap, while one is distinct. Let's try the same thing for petals.

plot(Petal.Width ~ Petal.Length, col = Species, data = iris)
text(2, c(2.1, 2.2, 2.3), paste("Iris", levels(iris$Species)), col = 1:3)

# We see that the species are well separated by the petal data. It looks like a
# straight line could be fit through all these points. This calls for regression.

# Instead of just calling the lm() function for linear regression and printing it
# to the console, we will assign it to an object, giving it the name "model1".

model1 <- lm(Petal.Width ~ Petal.Length, data = iris)

# If we run print(model1), or just model1, we can see the intercept and slope.

model1

# Call:
# lm(formula = Petal.Width ~ Petal.Length, data = iris)
#
# Coefficients:
#  (Intercept)  Petal.Length
#      -0.3631        0.4158

# The intercept means that, hypothetically, if the petal length were zero centimeters,
# the petal width is predicted to be -0.3631 centimeters. The more meaningful number
# is the slope, which says that for every centimeter that the petal length increases,
# the petal width (the response variable here) increases by 0.4158 centimeters.

# We can plot this regression line in blue, using the abline() function.

abline(model1, col = "blue")

# The natural question is, would we get a significantly better fit with a more
# complex model?
# A more complex model always gives a better fit, but the question is whether it
# is significantly better, that is, better than would be expected by chance.

model2 <- lm(Petal.Width ~ Petal.Length + Species, data = iris)
model2

# Call:
# lm(formula = Petal.Width ~ Petal.Length + Species, data = iris)
#
# Coefficients:
#       (Intercept)       Petal.Length  Speciesversicolor   Speciesvirginica
#          -0.09083            0.23039            0.43537            0.83771

# model2 still has one slope associated with petal length, but it has a different
# intercept for each species. The overall intercept is for the "setosa" species,
# which comes first alphabetically. The coefficients for "versicolor" and "virginica"
# are the differences between them and "setosa". The slope in this model is constant.

# We can plot the three regression lines, with some difficulty.

abline(coef(model2)[1], coef(model2)[2], col = "black")
abline(sum(coef(model2)[c(1, 3)]), coef(model2)[2], col = "red")
abline(sum(coef(model2)[c(1, 4)]), coef(model2)[2], col = "green")

# The fit looks surprisingly different from model1. In part this is because many
# points are sitting on top of each other in the scatterplot (each species has
# 50 observations).

# To test whether model2 is significantly better than model1, we have two choices.
# We can compare the models directly with the anova() function:

anova(model1, model2, test = "Chisq")

# This returns a p-value of 5.482e-10, or 5.482 x 10^-10, or 0.0000000005482.
# In other words, model2 is a highly significant improvement over model1.

# We can also use the summary() function to examine model2.

summary(model2)

# This method shows a separate p-value for each species coefficient. It means
# that "versicolor" is significantly different from "setosa" (p = 4.04e-05), and
# that "virginica" is significantly different from "setosa" (p = 4.71e-08).
# To test "virginica" against "versicolor" we would have to relevel the variable.

# The above examples have been linear regression models, because the response
# variable is a real number, at least in theory. When the response is binary,
# we use logistic regression, which is almost exactly the same, except instead of
# predicting a value of the response, given a certain combination of predictors,
# it predicts the log-odds of the response, which is the natural logarithm of
# the odds, p / (1 - p), the probability of one outcome divided by the probability
# of the other. A log-odds of zero means a probability of 0.5. A positive log-odds
# means a greater probability, and a negative log-odds means a lesser one. A rough
# rule of thumb: an effect of +/- 1 log-odds is medium-sized, +/- 2 is large.

# We can load the department store data gathered by Labov in 1962:

ds <- read.csv("http://www.danielezrajohnson.com/ds.csv")
# if no Internet, try locally: ds <- read.csv(file.choose())

head(ds)

# This data set has 729 observations of a response variable "r" (1 means r was
# pronounced, 0 means it was not pronounced), with three predictor variables.
# store: which of the three department stores the observation was made at
# emphasis: whether it was the first repetition ("normal") or the second ("emphatic")
# word: whether it was the word "fourth" or the word "floor"

# Labov did not carry out regression on this data. If we do, we can see that
# only two of his three variables are statistically significant. Which ones?
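# As a quick illustration of the log-odds scale, the built-in plogis() and
# qlogis() functions convert between probabilities and log-odds:

qlogis(0.5) # 0: a probability of 0.5 corresponds to a log-odds of zero
plogis(1)   # 0.731: a log-odds of +1 is a probability of about 0.73
plogis(-2)  # 0.119: a log-odds of -2 is a probability of about 0.12

# Before fitting any models, it is worth looking at the raw proportions of the
# response for each predictor. A minimal sketch, assuming the columns are coded
# as described above (with r being 0 or 1):

prop.table(xtabs(~ store + r, data = ds), 1)    # proportion of r by store
prop.table(xtabs(~ emphasis + r, data = ds), 1) # proportion of r by emphasis
prop.table(xtabs(~ word + r, data = ds), 1)     # proportion of r by word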
# To fit a logistic model, use the glm() function and the binomial family, e.g.:

model3 <- glm(r ~ 1, ds, family = binomial)
model4 <- glm(r ~ store, ds, family = binomial)
model5 <- glm(r ~ emphasis + word, ds, family = binomial)
model6 <- glm(r ~ store + emphasis + word, ds, family = binomial)

# Note that the other predictors in a model can affect both the significance and
# the coefficient estimate of a given predictor. The "step-down" comparison (model5
# vs. model6) is usually more reliable than the "step-up" (model3 vs. model4).

# Another point about logistic regression is that it requires more observations
# to achieve the same strength of results as an equivalent linear regression.

################################################
# PART TWO: WHY DO WE NEED MIXED-EFFECTS MODELS?
################################################

# Fixed-effects regression models overestimate the significance (underestimate
# the p-value) of higher-level predictors when the grouping structure of the
# data is ignored. This is probably their biggest drawback.

# If the data is not only grouped, but the amount of data from each group is
# unequal (unbalanced data), then fixed-effects models will produce estimates
# that are weighted accordingly, which is usually not the desired result.

# And specifically in logistic regression, predictor effects will be shrunken
# (attenuated) if the groups in the data (e.g. speakers) differ consistently
# from one another.

# The following function compares fixed-effects and mixed-effects results for a
# simulated logistic regression data set, allowing you to vary the true underlying
# parameters. It assumes two groups of speakers, with a binary "between-speaker"
# effect between the groups (e.g. gender). There is also a "within-speaker" effect
# (e.g. speech style) that is constant for all speakers.
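# In other words, for each token the simulation generates a binary outcome
#   y ~ Bernoulli(p), where p = plogis(b + w + s),
# b is the between-speaker effect, w is the within-speaker effect, and s is a
# per-speaker intercept drawn from a normal distribution with mean 0 and
# standard deviation speaker.sd.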
# Load the function by cutting-and-pasting (or Ctrl-R or Command-Enter), then
# run the function, experimenting with different parameter settings.

compare <- function(speakers, tokens, between, speaker.sd, within){
    # within-speaker predictor: each speaker contributes tokens/2 observations
    # to each condition, so tokens should be an even number
    w <- rep(c(-within / 2, within / 2), each = speakers * tokens)
    # between-speaker predictor: half the speakers in each group
    b <- rep(rep(c(-between / 2, between / 2), each = speakers * tokens / 2), 2)
    # random speaker intercepts, drawn from a normal distribution
    s <- rep(rep(rnorm(2 * speakers, 0, speaker.sd), each = tokens / 2), 2)
    dat <- data.frame(w = w, b = b, s = s)
    # generate the binary response from the true model
    dat$y <- rbinom(nrow(dat), 1, plogis(dat$w + dat$b + dat$s))
    # if a true effect is zero, recode the predictor so it still has two levels
    if (within == 0) dat$w <- rep(c(0, 1), each = speakers * tokens)
    if (between == 0) dat$b <- rep(rep(c(0, 1), each = speakers * tokens / 2), 2)
    # if there is no speaker variation, recode s so it still identifies speakers
    if (speaker.sd == 0) dat$s <- rep(rep(1:(2 * speakers), each = tokens / 2), 2)
    # fixed-effects model
    m <- glm(y ~ as.factor(b) + as.factor(w), dat, family = binomial)
    fixed.between.e <- summary(m)$coefficients[2, 1]
    fixed.between.p <- summary(m)$coefficients[2, 4]
    fixed.within.e <- summary(m)$coefficients[3, 1]
    fixed.within.p <- summary(m)$coefficients[3, 4]
    # mixed-effects model with a random intercept for speaker
    mm <- glmer(y ~ as.factor(b) + as.factor(w) + (1 | s), dat, family = binomial)
    mixed.between.e <- summary(mm)$coefficients[2, 1]
    mixed.between.p <- summary(mm)$coefficients[2, 4]
    mixed.within.e <- summary(mm)$coefficients[3, 1]
    mixed.within.p <- summary(mm)$coefficients[3, 4]
    res <- data.frame(
        between.est = c(fixed.between.e, mixed.between.e),
        between.p = c(fixed.between.p, mixed.between.p),
        within.est = c(fixed.within.e, mixed.within.e),
        within.p = c(fixed.within.p, mixed.within.p))
    rownames(res) <- c("fixed", "mixed")
    signif(res, 3)}

# The following command says there are 20 speakers per group, 100 tokens per
# speaker, with a between-speaker effect size of 0.5 log-odds, a speaker intercept
# standard deviation of 1.0 log-odds, and a within-speaker effect size of 0.5 log-odds.
# The output gives the effect sizes and p-values for both effects for both models.

compare(20, 100, 0.5, 1.0, 0.5)

#       between.est between.p within.est within.p
# fixed       0.145    0.0222      0.374 4.15e-09
# mixed       0.210    0.4460      0.431 2.45e-10
#
# (Since random numbers are being generated, your results will differ.)

# You should see that the between-speaker significance varies greatly between
# the fixed-effects and mixed-effects models, but the within-speaker effect is
# similar across models. Practice with between = 0, within = 0, and other settings.
# Practice repeating the function with the same settings to see the effect of chance.

# Remember that in a data set with a crossed grouping structure - for example,
# speaker and word - what is a "between" predictor with respect to one grouping
# factor can be a "within" predictor with respect to the other - and vice versa.
# Using crossed random effects (unlike this function), we can capture this variability.

# If speaker variability (speaker.sd) is very low, or zero, there will be no
# difference between the fixed-effects and mixed-effects models. This shows that
# the glmer() method is "safe" to use even if there might not be group variation.

compare(20, 100, 0.5, 0.05, 0.5)

#       between.est between.p within.est within.p
# fixed       0.398  4.78e-10      0.446 3.02e-12
# mixed       0.398  4.78e-10      0.446 3.02e-12

# You may also notice that if speaker variability is high, it can obscure a real
# between-speaker effect. In this case, the fixed-effects model is more accurate.
# But the fixed-effects model also identifies spurious between-speaker effects,
# so the more conservative behavior of the mixed-effects model is still preferable.
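# For example, to see how often the fixed-effects model reports a spurious
# between-speaker effect, set the true between-speaker effect to zero and run
# the command several times (the output will vary, since the data is random):

compare(20, 100, 0, 1.0, 0.5)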
##########################################
# PART THREE: FITTING MIXED-EFFECTS MODELS
##########################################

# To practice fitting models with random intercepts and random slopes, we will use
# a fairly typical sociolinguistic data set. It consists of 3000 observations of a
# binary variable, which is the presence or absence of post-vocalic /r/ on the
# Lower East Side of New York City. This data was collected by Kara Becker.

les.r <- read.csv("http://www.danielezrajohnson.com/les_r.csv")
# if no Internet, try locally: les.r <- read.csv(file.choose())

# This is a binary response variable, so we use logistic regression.
# We are estimating the log-odds of the response, log(p / (1 - p)), where
# p is the probability of a realized post-vocalic /r/.

# This data file has five possible predictor variables. Two of them, speaker and
# word, can be used as grouping factors. Since there are only 7 speakers, speaker
# could easily be used as a fixed effect instead. Note that 5 is the minimum
# recommended number of groups for a grouping factor.

# Age is a between-speaker factor. If possible, we would like to know whether
# younger speakers are using more post-vocalic /r/. With only two younger speakers
# and five older speakers, this may be impossible to confirm.

# Stress is a between-word factor. Each word either has a stressed /r/, like "car",
# or an unstressed /r/, like "father". This is a major predictor of /r/ realization.

# Topic - whether the speaker is talking about the neighborhood or not - is neither
# a between-speaker factor nor a between-word factor. Each speaker is sometimes
# talking about the neighborhood, sometimes not. Similarly, any particular word
# can be used to talk about the neighborhood or to talk about something else.

# This matters if we are selecting random slopes. The different speakers could have
# different stress effects, so a random slope (stress | speaker) is possible. The
# different words could conceivably show different age effects, so (age | word) is
# a possible random slope. The effect of topic could vary by speaker and/or by
# word, meaning that both (topic | speaker) and (topic | word) are possible.

# First, however, we will fit a basic fixed-effects model and see what happens.

model7 <- glm(r ~ 1 + age + stress + topic, les.r, family = binomial)

# Looking at summary(model7), we see that as usual, the fixed-effects model is
# reporting incredibly significant p-values for all three predictors.

# What happens to those p-values if we use random effects? First, we will introduce
# random intercepts for speaker and word. We know that /r/ can vary along these lines.

model8 <- glmer(r ~ 1 + age + stress + topic + (1 | speaker) + (1 | word),
    les.r, family = binomial)

# What happens if we use a "maximal" random effect structure, with random slopes
# for all three predictors? Such a maximal structure is the most conservative,
# and is recommended as long as the model fits within an acceptable length of time.

model9 <- glmer(r ~ 1 + age + stress + topic + (1 + stress + topic | speaker) +
    (1 + age + topic | word), les.r, family = binomial)

# Luckily, model9 took less than a minute to fit. What has happened to the p-values?
# Do we still want to conclude that all three predictors affect the response?

# The summary() function is one way to get p-values, and especially when the
# predictors are binary, this is very convenient. We can also get p-values by
# model comparison. Another useful function in this regard is drop1().
# It conducts a series of tests of the significance of all the predictors in the
# model. It can be slow.

drop1(model9, test = "Chisq")

# Single term deletions
# Model:
# r ~ 1 + age + stress + topic + (1 + stress + topic | speaker) +
#     (1 + age + topic | word)
#        Df    AIC     LRT   Pr(Chi)
# <none>    3503.7
# age     1 3509.9  8.1649 0.0042708 **
# stress  1 3515.6 13.9107 0.0001917 ***
# topic   1 3506.6  4.8949 0.0269358 *

# In this section, we have been focusing on p-values. At least for significant
# predictors, we also want to discuss the actual effect size estimates. In a mixed
# model, we additionally have information about the random effect variances, as
# well as the correlations between them.

# It is also possible to look at the individual random effect estimates, which
# are not parameters of the model, but are still useful. This will be discussed
# further in the next section.

##############################################
# PART FOUR: WORKING WITH MIXED-EFFECTS MODELS
##############################################

# The output of an lmer() or glmer() model contains a lot of information. Some of
# it is easy to understand, but other parts require some explanation.

# Let's look at the output of model9. Either

print(model9)

# or simply entering

model9

# will produce the following output:

# Generalized linear mixed model fit by maximum likelihood ['glmerMod']
#  Family: binomial ( logit )
# Formula: r ~ 1 + age + stress + topic + (1 + stress + topic | speaker) +
#     (1 + age + topic | word)
#    Data: les.r
#      AIC      BIC   logLik deviance
# 3503.716 3599.818 -1735.858 3471.716
# Random effects:
#  Groups  Name        Std.Dev. Corr
#  word    (Intercept) 0.6808
#          ageyounger  0.8777   0.03
#          topicother  0.3554   0.71 -0.44
#  speaker (Intercept) 0.2832
#          stressyes   0.1162   0.22
#          topicother  0.2320   0.99  0.34
# Number of obs: 3000, groups: word, 544; speaker, 7
# Fixed Effects:
# (Intercept)  ageyounger   stressyes  topicother
#     -1.5999      1.2534      0.7886      0.3622

# Let's go through this line by line. First, the top four lines:

# # Generalized linear mixed model fit by maximum likelihood ['glmerMod']
# #  Family: binomial ( logit )
# # Formula: r ~ 1 + age + stress + topic + (1 + stress + topic | speaker) +
# #     (1 + age + topic | word)
# #    Data: les.r

# This reminds you what sort of model has been fit, where a "generalized" LMM
# means anything but an ordinary linear mixed model with a numeric response.
# This is a bit imprecise, since the LMM is really a special case of the GLMM,
# but the term is often used this way.

# The next two lines are basically four versions of the same thing. All these models
# are fit by maximum likelihood, which means, more or less, that the software finds
# the parameter values (directly related to the predictor estimates) that make the
# observed data most probable. Of course, the chance of obtaining the exact data
# observed is still very small, even when it is maximized. In this case, the
# logarithm of the likelihood, at its maximum, is -1735.858.

# #      AIC      BIC   logLik deviance
# # 3503.716 3599.818 -1735.858 3471.716

# The deviance is defined as -2 times the log-likelihood. This statistic is what
# is used to compare models, because under the null hypothesis, the change in
# deviance between two nested models follows a chi-squared distribution, with
# degrees of freedom equal to the change in the number of parameters. What does
# this mean?

# Say you have a model with one parameter - a simple intercept. You are testing
# whether a model with two parameters - an intercept and a slope, for example -
# offers a significant improvement (it will always offer some improvement, but so
# would adding any parameter).
# The change in deviance, assuming the true value of the new parameter is zero
# (and a few other things), is chi-squared distributed. This allows us to
# calculate the p-value. This is called a likelihood ratio test.
# Both anova(m1, m2) and drop1(m1, test = "Chisq") perform likelihood ratio tests.

# The AIC and BIC are based on the deviance, penalized for the number of parameters
# in the model. Instead of performing a formal model comparison test, some people
# prefer to choose the model with a lower value of AIC or BIC. I am not fully
# informed on why they do this, or how it is really that different from an LRT.

# You can extract these values from any model as follows:

logLik(model9)
deviance(model9)
AIC(model9)
BIC(model9)

# None of these four statistics is useful to report on its own, because their
# values depend on the size of the data set. Only models fit to the same data can
# be compared this way.

# Let's move to the fixed effects part of the output, which fixef() can give us too:

fixef(model9)

# (Intercept)  ageyounger   stressyes  topicother
#     -1.5999      1.2534      0.7886      0.3622

# The model's prediction is always obtained by adding up these coefficients. The
# problem is sometimes knowing which to add. Here, the intercept value is the
# predicted log-odds for an older speaker, in an unstressed syllable, talking about
# the Lower East Side neighborhood. If the speaker were younger, we would add 1.2534.
# If the syllable were stressed, we would add 0.7886. If the topic were something
# else, we would add 0.3622.

# We could relevel any of these factors and re-fit the model. For example, to make
# "other" the baseline level of topic:
# les.r$topic <- relevel(les.r$topic, "other")

# All these numbers are in log-odds units, but once they are combined, they can
# be converted back to probabilities using the plogis() function.

plogis(fixef(model9)[1])        # 0.168 (older, unstressed, neighborhood topic)
plogis(sum(fixef(model9)[1:4])) # 0.691 (younger, stressed, other topic)

# How do these predictions compare to the actual observed proportions? Let's see.

mean(les.r$r[les.r$age == "older" & les.r$stress == "no" & les.r$topic == "neighborhood"]) # 0.224
mean(les.r$r[les.r$age == "younger" & les.r$stress == "yes" & les.r$topic == "other"]) # 0.686

# The intercept cell is quite far from its prediction. There are two likely reasons
# for this. First, if we have unmodeled interactions in the fixed effects, some of
# the cells will not fit well. Second, this model has random effects for speaker
# and word. The fixed effects reported are for a sort of average speaker and word.
# However, word, especially, tends to be a very skewed variable. There will always
# be a few very common words that may favor or disfavor the response. The mixed
# model largely counteracts this weighting.

# Calculating the fixed-effects predictions becomes more complicated when there
# are continuous predictors, and especially when there are interactions.

# Finally, let us examine what is unique to the mixed model, the random effects.
# We can also use the VarCorr() extractor to obtain this part of the output:

VarCorr(model9)

#  Groups  Name        Std.Dev. Corr
#  word    (Intercept) 0.68076
#          ageyounger  0.87774  0.031
#          topicother  0.35537  0.706 -0.440
#  speaker (Intercept) 0.28318
#          stressyes   0.11620  0.224
#          topicother  0.23204  0.993  0.337

# Above, we see 12 numbers. These are all parameters of the model, on a par with
# the fixed-effects estimates. This model has 4 fixed-effects parameters and 12
# in the random effects. Of these 12, six are estimates of standard deviations.
# The other six are correlation parameters, which we won't discuss further.

# The model is using the sample of words in the data and making an estimate that
# the population of all words varies - some words favoring the response, some
# words disfavoring it - following a normal distribution with a standard deviation
# of 0.681. That the random effects are normally distributed is one of the
# assumptions of the mixed-effects model, although I have never seen it addressed
# how bad it is if this assumption is not quite met...

# The intercept variation by speaker is 0.283. This is a very low number. It may
# be that with only 7 speakers, the software is not estimating it properly.

# The other standard deviations can be interpreted in the same way, except they
# are for the slopes, not the intercepts. Speakers - all speakers, insofar as they
# can be estimated from a sample of 7 - vary in their stress effect according to a
# normal distribution with a standard deviation of 0.116. They vary in their topic
# effect with a standard deviation of 0.232.

# If these random effects had been larger, we would have seen larger changes to the
# model as we added them. A large intercept standard deviation for speaker would
# usually take away from the significance of a between-speaker predictor like age.
# A large standard deviation for the stress random slope would lower the overall
# significance of stress. However, all these variance estimates are quite small.

# The final element of the lmer/glmer model is not visible in the print() output.
# These are the so-called BLUPs (best linear unbiased predictors) or "conditional
# modes" of the random effects. We obtain these with the ranef() extractor function.

# ranef(model9) # too much output!

ranef(model9)$speaker

#         (Intercept)    stressyes   topicother
# Ann     -0.12699043 -0.070881008 -0.11756414
# Lindsey  0.13570318 -0.016811909  0.10338279
# Lucille -0.05119173  0.026939650 -0.03408799
# Mae     -0.25268322 -0.096255293 -0.22313884
# Maggie   0.57757161  0.076152021  0.47551887
# Michael -0.04597223  0.009819603 -0.03402146
# Tobias  -0.08541389  0.044407880 -0.05694121

# These effects are not independent parameters of the model. They are a kind of
# best guess, given the variance parameters that have been estimated. In any case,
# they can be treated as deviations from the corresponding fixed effects.
# Note: as a rule, always include the fixed effect corresponding to each random
# effect!

# So if we take the fixed effects again:

fixef(model9)

# (Intercept)  ageyounger   stressyes  topicother
#  -1.5999001   1.2534203   0.7885791   0.3621711

# We know that the age effect does not vary by speaker - a by-speaker age slope
# makes no sense, since each speaker has only one age - but the other fixed
# effects do: the intercept, the stress effect, and the topic effect.

# We can add the fixed and random effects together to see the total prediction.
# There is another extractor function that does this for you: coef().

coef(model9)$speaker

#         (Intercept) ageyounger stressyes topicother
# Ann       -1.726891    1.25342 0.7176981  0.2446070
# Lindsey   -1.464197    1.25342 0.7717672  0.4655539
# Lucille   -1.651092    1.25342 0.8155188  0.3280831
# Mae       -1.852583    1.25342 0.6923238  0.1390323
# Maggie    -1.022328    1.25342 0.8647312  0.8376900
# Michael   -1.645872    1.25342 0.7983987  0.3281497
# Tobias    -1.685314    1.25342 0.8329870  0.3052299

# Even though there is a column for age, it is misleading: coef() simply repeats
# the fixed effect for every speaker, because age has no random slope by speaker.
# The age effect of +1.253 has not actually been applied here, and it should be
# (to Lindsey and Tobias, the younger speakers) in order to make this a more
# sensible display.
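# A minimal sketch of that adjustment, adding the age effect to the intercepts of
# the two younger speakers (the new column name is just for display purposes):

spk <- coef(model9)$speaker
spk$adj.intercept <- spk[["(Intercept)"]] +
    ifelse(rownames(spk) %in% c("Lindsey", "Tobias"), spk$ageyounger, 0)
spk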
# Of course, there are corresponding sections of ranef(model9) and coef(model9)
# for the word grouping factor; it was just easier to deal with the speakers.

################# SETTING CONTRASTS #################

# Sometimes sum contrasts (or even another type) work better than baseline-treatment
# contrasts for a particular model. You can set contrasts globally or variable-by-
# variable. To set contrasts globally, use one of these commands:

options(contrasts = c("contr.sum", "contr.poly"))       # sets sum contrasts
options(contrasts = c("contr.treatment", "contr.poly")) # sets treatment contrasts

# To set the contrasts for an individual variable:

contrasts(les.r$age) <- contr.sum(2) # 2 is the number of levels
contrasts(les.r$age) <- contr.treatment(2)

# In treatment contrasts, the first level - use levels() to check - is the baseline,
# and does not appear in the output. The other levels are relative to the baseline.
# In sum contrasts, the first level is labeled 1, the second level 2, etc. The
# coefficients represent deviations from the mean. The last level does not appear
# in the output. Its coefficient would be the negative of the sum of the other
# levels' coefficients, so that the deviations sum to zero.
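# To see exactly what these coding schemes look like, you can print the contrast
# matrix for a factor with a given number of levels, e.g. three:

contr.treatment(3)
#   2 3
# 1 0 0
# 2 1 0
# 3 0 1

contr.sum(3)
#   [,1] [,2]
# 1    1    0
# 2    0    1
# 3   -1   -1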