-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenetic_correlation_sites.R
More file actions
137 lines (131 loc) · 6.62 KB
/
genetic_correlation_sites.R
File metadata and controls
137 lines (131 loc) · 6.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Lindsay V. Clark
# November 18, 2016
# Feel free to modify and distribute. This R function comes with no warranty.
# Function to find genetic correlation between traits measured on different plants,
# e.g. the same trait measured in different environments.
# See equation 8 of Howe et al. (2000 doi:10.1007/s001220051525)
# see also http://www.knowledgebank.irri.org/ricebreedingcourse/Lesson_8_Correlations_among_traits__implications_for_screening.htm
# formula: The formula for the linear model that will be used to get variance components for
# estimating heritability within environment. Should start with "Response" and then include
# terms from the column names of data, formatted for lme4. It is important to include
# genotype (genIDName) and replicates in the model. Environment (siteName) should not
# be included in the model.
# Example: Response ~ (1|Genotype) + (1|Rep)
# Example: Response ~ (1|Genotype) + (1|Year/Rep)
# Example: Response ~ (1|Genotype) + (1|Year/Rep) + (1|Genotype:Year)
# data: A data frame with columns for genotype or family IDs, environment IDs, replicates,
# any other grouping factors, and phenotypes.
# genIDName: Name of column in data containing genotype ID (for estimation of genetic component of
# variance). In the above example this would be "Genotype".
# siteName: The name of the column listing the environment for each observation. These are the
# environments between which genetic correlations will be estimated.
# traitNames: Names of columns containing phenotype data. For each phenotype listed in traitNames,
# a matrix of genetic correlations between environments will be output.
# subset: Subset of data to analyze. (Numeric, boolean, or character vector indicating rows to keep.)
genCorrSites <- function(formula, data, genIDName, siteName, traitNames, subset = 1:dim(data)[1]){
require(lme4)
if(!all(traitNames %in% names(data))){
stop("All traitNames need to be column names of data.")
}
if(length(genIDName) != 1){
stop("Only one genIDName allowed.")
}
if(!genIDName %in% names(data)){
stop("genIDName not found in column names of data.")
}
if(length(siteName) != 1){
stop("Only one sitName allowed.")
}
if(!siteName %in% names(data)){
stop("siteName not found in column names of data.")
}
data <- data[subset,]
nTrait <- length(traitNames)
nObs <- dim(data)[1]
# manipulate formula to build new ones for linear models
OriginalTerms <- attr(terms(formula), "term.labels")
for(i in grep("|", OriginalTerms, fixed = TRUE)){ # fix formatting of random effects
OriginalTerms[i] <- paste("(", OriginalTerms[i], ")", sep = "")
}
# get site names
sites <- as.character(unique(data[[siteName]]))
sites <- sites[!is.na(sites)]
nSites <- length(sites)
# set up lists to contain output
gencorrByTrait <- list()
length(gencorrByTrait) <- nTrait
names(gencorrByTrait) <- traitNames
linemeansByTrait <- nindByTrait <- gencorrByTrait
# loop through traits to do analysis
for(tr in traitNames){
# matrix to contain results
gencorrByTrait[[tr]] <- nindByTrait[[tr]] <- matrix(NA, nrow = nSites, ncol = nSites,
dimnames = list(sites, sites))
# get line means
linemeansByTrait[[tr]] <- tapply(data[[tr]], list(data[[genIDName]], data[[siteName]]),
mean, na.rm = TRUE)
# loop through pairs of sites
for(s1 in sites[-nSites]){
for(s2 in sites[(match(s1, sites) + 1):nSites]){
message(paste("Evaluating", tr, "for", s1, "and", s2))
# genotypes for which there is data for both sites for this trait
theseGen <- dimnames(linemeansByTrait[[tr]])[[1]][!is.na(linemeansByTrait[[tr]][,s1]) &
!is.na(linemeansByTrait[[tr]][,s2])]
nindByTrait[[tr]][s1,s2] <- nindByTrait[[tr]][s2,s1] <- length(theseGen)
if(length(theseGen) == 0) next
# subsets of data frame to use
subset1 <- which(data[[genIDName]] %in% theseGen & data[[siteName]] == s1)
subset2 <- which(data[[genIDName]] %in% theseGen & data[[siteName]] == s2)
# skip if there are too few observations
if(sum(!is.na(data[[tr]][subset1])) <= length(theseGen) |
sum(!is.na(data[[tr]][subset2])) <= length(theseGen)){
next
}
# run linear model to get variances
model1 <- lmer(reformulate(termlabels = OriginalTerms, response = tr),
data = data, subset = subset1)
model2 <- lmer(reformulate(termlabels = OriginalTerms, response = tr),
data = data, subset = subset2)
grp1 <- ngrps(model1) # number of groups
grp1 <- grp1[-grep(genIDName, names(grp1))] # eliminate genotype groups since we don't need those
vc1 <- as.data.frame(VarCorr(model1)) # variances
grp2 <- ngrps(model2)
grp2 <- grp2[-grep(genIDName, names(grp2))]
vc2 <- as.data.frame(VarCorr(model2))
# genetic variance
gvar1 <- vc1$vcov[match(genIDName, vc1$grp)]
gvar2 <- vc2$vcov[match(genIDName, vc2$grp)]
# residual variance divided by total number of replicates
resvar1 <- vc1$vcov[match("Residual", vc1$grp)]/max(grp1)
resvar2 <- vc2$vcov[match("Residual", vc2$grp)]/max(grp2)
# any genotype interaction terms, divided by the appropriate coefficient
intvar1 <- 0
intvar2 <- 0
for(trm in grep(paste(genIDName, ":", sep = ""), vc1$grp)){
thisvar1 <- vc1$vcov[trm]
thisvar2 <- vc2$vcov[trm]
trmName <- substring(vc1$grp[trm], nchar(genIDName)+2, nchar(vc1$grp[trm]))
if(!trmName %in% names(grp1)){
stop("Error matching terms for genotype interaction effects.")
}
thiscoef1 <- grp1[trmName]
thiscoef2 <- grp2[trmName]
intvar1 <- intvar1 + thisvar1/thiscoef1
intvar2 <- intvar2 + thisvar2/thiscoef2
}
# estimate heritability
herit1 <- gvar1/(gvar1 + resvar1 + intvar1)
herit2 <- gvar2/(gvar2 + resvar2 + intvar2)
# estimate correlation between line means
phencor <- cor(linemeansByTrait[[tr]][theseGen,s1],
linemeansByTrait[[tr]][theseGen,s2])
# estimate genetic correlation
gencor <- phencor/sqrt(herit1 * herit2)
# add to output
gencorrByTrait[[tr]][s1,s2] <- gencorrByTrait[[tr]][s2,s1] <- gencor
}
}
}
return(list(line.means = linemeansByTrait, genetic.correlations = gencorrByTrait,
number.of.genotypes.used = nindByTrait))
}