---
title: "Minimizing rStress using Majorization"
author: "Jan de Leeuw, Patrick Groenen, Patrick Mair"
date: "Version 011, January 14, 2016"
output:
  pdf_document:
    keep_tex: yes
    number_sections: yes
    toc: yes
    toc_depth: 3
  html_document:
    keep_md: yes
    number_sections: yes
    toc: yes
    toc_depth: 3
graphics: yes
fontsize: 12pt
bibliography: rStress.bib
---
```{r packages, echo = FALSE}
library(captioner)
figure_nums <- captioner (prefix = "Figure")
figure_nums (name = "poldist_mfrow", caption = "De Gruijter Data: Separate Configurations", display = FALSE)
figure_nums (name = "poldist_configuration", caption = "De Gruijter Data: Joint Configuration", display = FALSE)
figure_nums (name = "poldist_rstress", caption = "De Gruijter Data: rStress", display = FALSE)
figure_nums (name = "poldist_shepard", caption = "De Gruijter Data: Shepard Diagrams", display = FALSE)
figure_nums (name = "ekman_configuration", caption = "Ekman Color Data: Configuration", display = FALSE)
figure_nums (name = "ekman_shepard", caption = "Ekman Color Data: Shepard Diagrams", display = FALSE)
figure_nums (name = "ekman_rstress", caption = "Ekman Color Data: rStress", display = FALSE)
figure_nums (name = "ekman_tilde", caption = "Ekman Color Data: Power Transformation", display = FALSE)
```
```{r exec_code, echo = FALSE}
torgerson <- function (delta, p = 2) {
  doubleCenter <- function (x) {
    n <- dim (x)[1]
    m <- dim (x)[2]
    s <- sum (x) / (n * m)
    xr <- rowSums (x) / m
    xc <- colSums (x) / n
    return ((x - outer (xr, xc, "+")) + s)
  }
  z <- eigen (-doubleCenter ((as.matrix (delta) ^ 2) / 2), symmetric = TRUE)
  v <- pmax (z$values, 0)
  return (z$vectors[, 1:p] %*% diag (sqrt (v[1:p])))
}
enorm <- function (x, w = 1) {
  return (sqrt (sum (w * (x ^ 2))))
}
sqdist <- function (x) {
  s <- tcrossprod (x)
  v <- diag (s)
  return (outer (v, v, "+") - 2 * s)
}
mkBmat <- function (x) {
  d <- rowSums (x)
  x <- -x
  diag (x) <- d
  return (x)
}
mkPower <- function (x, r) {
  n <- nrow (x)
  return (abs ((x + diag (n)) ^ r) - diag (n))
}
matchConf <- function (x, eps = 1e-6, itmax = 100, verbose = TRUE) {
  m <- length (x)
  n <- nrow (x[[1]])
  p <- ncol (x[[1]])
  itel <- 1
  dold <- Inf
  repeat {
    y <- matrix (0, n, p)
    for (k in 1:m)
      y <- y + x[[k]]
    y <- y / m
    dnew <- 0
    for (k in 1:m) {
      s <- svd (crossprod (x[[k]], y))
      x[[k]] <- tcrossprod (x[[k]] %*% (s$u), s$v)
      dnew <- dnew + sum ((y - x[[k]]) ^ 2)
    }
    if (verbose) {
      cat (formatC (itel, width = 4, format = "d"),
           formatC (dold, digits = 10, width = 13, format = "f"),
           formatC (dnew, digits = 10, width = 13, format = "f"),
           "\n")
    }
    if ((itel == itmax) || ((dold - dnew) < eps))
      break ()
    itel <- itel + 1
    dold <- dnew
  }
  return (x)
}
rStressMin <- function (delta,
                        w = 1 - diag (nrow (delta)),
                        p = 2,
                        r = 0.5,
                        eps = 1e-10,
                        itmax = 100000,
                        verbose = TRUE) {
  delta <- delta / enorm (delta, w)
  itel <- 1
  xold <- torgerson (delta, p = p)
  xold <- xold / enorm (xold)
  n <- nrow (xold)
  nn <- diag (n)
  dold <- sqdist (xold)
  rold <- sum (w * delta * mkPower (dold, r))
  nold <- sum (w * mkPower (dold, 2 * r))
  aold <- rold / nold
  sold <- 1 - 2 * aold * rold + (aold ^ 2) * nold
  repeat {
    p1 <- mkPower (dold, r - 1)
    p2 <- mkPower (dold, (2 * r) - 1)
    by <- mkBmat (w * delta * p1)
    cy <- mkBmat (w * p2)
    ga <- 2 * sum (w * p2)
    be <- (2 * r - 1) * (2 ^ r) * sum (w * delta)
    de <- (4 * r - 1) * (4 ^ r) * sum (w)
    if (r >= 0.5) {
      my <- by - aold * (cy - de * nn)
    }
    if (r < 0.5) {
      my <- (by - be * nn) - aold * (cy - ga * nn)
    }
    xnew <- my %*% xold
    xnew <- xnew / enorm (xnew)
    dnew <- sqdist (xnew)
    rnew <- sum (w * delta * mkPower (dnew, r))
    nnew <- sum (w * mkPower (dnew, 2 * r))
    anew <- rnew / nnew
    snew <- 1 - 2 * anew * rnew + (anew ^ 2) * nnew
    if (verbose) {
      cat (formatC (itel, width = 4, format = "d"),
           formatC (sold, digits = 10, width = 13, format = "f"),
           formatC (snew, digits = 10, width = 13, format = "f"),
           "\n")
    }
    if ((itel == itmax) || ((sold - snew) < eps))
      break ()
    itel <- itel + 1
    xold <- xnew
    dold <- dnew
    sold <- snew
    aold <- anew
  }
  return (list (x = xnew, alpha = anew, sigma = snew, itel = itel))
}
```
Note: This is a working paper which will be expanded/updated frequently. One way to think about this paper is as an update of @deleeuw_U_14c, using simpler and more direct majorization methods. The directory [gifi.stat.ucla.edu/rstress](http://gifi.stat.ucla.edu/rstress) has a pdf copy of this article, the complete Rmd file that includes all code chunks, and R files with the code.
#Problem
Define the multidimensional scaling (MDS) loss function
\begin{equation}
\sigma_r(x)=\sum_{i=1}^nw_i(\delta_i-(x'A_ix)^r)^2,\label{E:rstress}
\end{equation}
with \(r>0\) and the \(A_i\) positive semi-definite. The $w_i$ are positive *weights*, the
$\delta_i$ are non-negative *dissimilarities*.
We call this *rStress*. Special cases are *stress* [@kruskal_64a] for \(r=\frac12\), *sstress* [@takane_young_deleeuw_A_77] for \(r=1\), and the loss function used in MULTISCALE [@ramsay_77] for \(r\rightarrow 0\).
Usually MDS is formulated using Euclidean distances
\(d_{j\ell}(X)\) between *points* \(j\) and \(\ell\), which are rows of an \(n\times p\) *configuration matrix* \(X\). This fits into our
formulation by setting \(x=\mathbf{vec}(X)\) and by setting the \(A_i\) to $np\times np$ matrices of the form \(I_p\otimes E_{j\ell}\), where the matrix \(E_{j\ell}\)
has elements \((j,j)\) and \((\ell,\ell)\) equal to \(+1\) and elements
\((j,\ell)\) and \((\ell,j)\) equal to \(-1\). Thus the $A_i$ are direct sums of $p$ copies of the $E_{j\ell}$. Now \(x'A_ix=d_{j\ell}^2(X)\).
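As a quick numerical check of this correspondence (a sketch, not part of the paper's code), the chunk below builds one such \(A_i\) for the hypothetical pair \((j,\ell)=(1,3)\) and verifies that \(x'A_ix\) equals the squared Euclidean distance between rows \(j\) and \(\ell\) of \(X\).

```{r amat_check, eval = FALSE}
# build one A_i = I_p %x% E_jl and compare x'A_ix with the squared
# Euclidean distance between rows j and l of X
n <- 4; p <- 2; j <- 1; l <- 3
set.seed (1)
X <- matrix (rnorm (n * p), n, p)
E <- matrix (0, n, n)
E[j, j] <- E[l, l] <- 1
E[j, l] <- E[l, j] <- -1
A <- kronecker (diag (p), E)     # direct sum of p copies of E_jl
x <- as.vector (X)               # vec(X) stacks the columns of X
sum (x * (A %*% x))              # equals sum ((X[j, ] - X[l, ]) ^ 2)
```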
The problem we are trying to solve is to find an algorithm to minimize \(\sigma_r\) over $x$ in $\mathbb{R}^m$ for all values of \(r>0\). It is understood that parts of the algorithm may be different
for different values of $r$.
#Minorization and Majorization
We will design a convergent iterative *majorization algorithm*. Briefly, this means constructing for each $r>0$ a *majorization function* \(\gamma_r\) on $\mathbb{R}^m\times\mathbb{R}^m$ such that \(\sigma_r(x)\leq\gamma_r(x,y)\) for all $x$ and $y$ and such that $\sigma_r(x)=\gamma_r(x,x)$ for all $x$. One step of the iterative algorithm is
$$
x^{(k+1)}=\text{arg}\min_x\gamma_r(x,x^{(k)}),
$$
unless $x^{(k)}$ already minimizes $\gamma_r(x,x^{(k)})$, in which case we stop.
If we do not stop we have the *sandwich inequality*
$$
\sigma_r(x^{(k+1)})\leq\gamma_r(x^{(k+1)},x^{(k)})<\gamma_r(x^{(k)},x^{(k)})=\sigma_r(x^{(k)}).
$$
Thus majorization algorithms exhibit *monotone convergence* of loss function values.
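As a toy illustration of the sandwich inequality, outside the MDS context, consider minimizing \(f(x)=\sum_i|x-a_i|\). Since \(|x-a_i|\leq\frac{(x-a_i)^2}{2|y-a_i|}+\frac{|y-a_i|}{2}\), with equality if \(x=y\), minimizing the majorizer gives a weighted-mean update, and the loss values decrease monotonically. A minimal sketch (assuming the iterates never hit an \(a_i\) exactly):

```{r mm_toy, eval = FALSE}
# toy majorization: |x - a| <= (x - a)^2 / (2 |y - a|) + |y - a| / 2,
# with equality at x = y; minimizing the majorizer is a weighted mean
a <- c(1, 2, 3, 10)
f <- function (x) sum (abs (x - a))
y <- mean (a)                           # starting point
for (itel in 1:50) {
  ynew <- sum (a / abs (y - a)) / sum (1 / abs (y - a))
  cat (itel, f (y), f (ynew), "\n")     # loss decreases monotonically
  if ((f (y) - f (ynew)) < 1e-10)
    break
  y <- ynew
}
```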
The history of majorization algorithms is complicated. They were first developed in the specific contexts of step-size estimation in descent algorithms [@ortega_rheinboldt_70], maximum likelihood estimation with missing data [@dempster_laird_rubin_77], and multidimensional scaling [@deleeuw_C_77]. Subsequently they were presented as a general class of optimization methods, and as a special case of block relaxation and augmentation methods, by @deleeuw_C_94c; see also @heiser_95. The material in @deleeuw_C_94c is **(slowly, slowly)** being expanded into an e-book, with one part on block relaxation, augmentation and alternating least squares [@deleeuw_E_15a], one part on majorization [@deleeuw_E_15b], and one part on mathematical background [@deleeuw_E_15c]. There have been many applications of the general majorization principle by *The Dutch School* in psychometrics, which includes Ten Berge, Kiers, Heiser, Meulman, Groenen, and Vermunt.
Systematic use of majorization in statistics started with @lange_hunter_yang_00. There have been many further reviews, developments, and applications by Ken Lange and his co-workers. They use the name *MM Algorithms*, where MM stands for either majorization-minimization or minorization-maximization. @lange_13 has a very nice chapter on the MM algorithm, and he is currently working on an MM methods book.
#Use of Homogeneity
We start by exploring the obvious fact that
\[
\min_x\sigma_r(x)=\min_{\theta\geq 0}\min_{x'x=1}\sigma_r(\theta x).
\]
Let us introduce some notation to simplify the discussion. Without loss of generality we assume the dissimilarities are scaled by
\[
\sum_{i=1}^nw_i\delta_i^2=1.
\]
Next, it is convenient to define
\[
\rho_r(x):=\sum_{i=1}^nw_i\delta_i(x'A_ix)^r,
\]
and
\[
\eta_r(x):=\sum_{i=1}^nw_i(x'A_ix)^{2r},
\]
so that
\begin{equation}
\sigma_r(x)=1-2\rho_r(x)+\eta_r(x).\label{E:expand}
\end{equation}
Now
\begin{equation}
\sigma_r(\theta x)=1-2\theta^{2r}\rho_r(x)+\theta^{4r}\eta_r(x).\label{E:homo}
\end{equation}
Thus, as in @deleeuw_C_77, thinking of rStress as a function of both $x$ and $\theta$,
\begin{equation}
\min_\theta\sigma_r(\theta, x)=1-\frac{\rho_r^2(x)}{\eta_r(x)},\label{E:theta}
\end{equation}
where the minimum is attained at
\begin{equation}
\theta^{2r}=\frac{\rho_r(x)}{\eta_r(x)}.\label{E:optt}
\end{equation}
Thus minimizing $\sigma_r$ can be done by maximizing the ratio in $\eqref{E:theta}$ over $x'x=1$. This is the approach of the majorization method proposed by @deleeuw_U_14c, which combines @dinkelbach_67 and quadratic majorization.
The approach of @deleeuw_U_14c is somewhat cumbersome, because we first use homogeneity to reduce the MDS problem to a fractional programming problem, and then we use Dinkelbach to get rid of the fractional objective function again. Moreover, one of the special cases requires us to solve in each iteration a modified eigenvalue problem of the type discussed, for example, by @hager_01.
In this paper we combine majorization with *alternating least squares*, working directly with $\eqref{E:homo}$. Thus we think of rStress as a function of both $\theta$ and $x$, and we alternate minimization over $\theta$ for fixed $x$ and over $x$ for fixed $\theta$:
$$
\begin{split}
\theta^{(k)}&=\text{arg}\min_\theta\sigma_r(\theta, x^{(k)}),\\
x^{(k+1)}&=\text{arg}\min_{x'x=1}\sigma_r(\theta^{(k)},x).
\end{split}
$$
The first step is trivially solved using $\eqref{E:optt}$. Updating $x$, however, is far from trivial and needs the majorization machinery we will develop in this paper. Since updating $x$ is iterative, we have to choose how many majorization steps to take for updating $x$, before going to the next update for $\theta$.
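Schematically the outer loop looks as follows, where `rho`, `eta`, `majorize_step`, and `converged` are hypothetical helpers standing in for the computations developed below; the function rStressMin in the code section takes exactly one majorization step per \(\theta\) update.

```{r als_sketch, eval = FALSE}
# schematic ALS loop (hypothetical helpers, sketch only)
repeat {
  alpha <- rho (x) / eta (x)         # optimal theta^(2r) for fixed x
  x <- majorize_step (x, alpha)      # one majorization step, then renormalize
  if (converged (x, alpha))
    break
}
```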
Thus the main focus of the paper will be a majorization method for minimizing
\begin{equation}
\sigma_r(x,\alpha):=1-2\alpha\rho_r(x)+\alpha^2\eta_r(x)\label{E:lambda}
\end{equation}
over $x'x=1$ with $\alpha:=\theta^{2r}$ fixed at its current value. It is clear from $\eqref{E:lambda}$ that we can find a majorization of $\sigma_r$ by combining a minorization of $\rho_r$ and a majorization of $\eta_r$.
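For reference, the following sketch computes \(\rho_r\), \(\eta_r\), the optimal \(\alpha\), and the resulting \(\sigma_r\) for a configuration \(x\). The helper `rStressParts` is not part of the algorithm; it reuses `enorm`, `sqdist`, and `mkPower` from the code section at the end of the paper.

```{r rstress_parts, eval = FALSE}
rStressParts <- function (delta, w, x, r) {
  delta <- delta / enorm (delta, w)        # normalize sum w * delta^2 = 1
  d <- sqdist (x / enorm (x))              # squared distances, with x'x = 1
  rho <- sum (w * delta * mkPower (d, r))  # rho_r(x)
  eta <- sum (w * mkPower (d, 2 * r))      # eta_r(x)
  alpha <- rho / eta                       # optimal alpha = theta^(2r)
  sigma <- 1 - 2 * alpha * rho + (alpha ^ 2) * eta
  list (rho = rho, eta = eta, alpha = alpha, sigma = sigma)
}
```

With \(\alpha=\rho_r(x)/\eta_r(x)\) the returned \(\sigma_r\) equals \(1-\rho_r^2(x)/\eta_r(x)\), in agreement with \(\eqref{E:theta}\).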
#Powers of Quadratic Forms
We start with some lemmas we will need to construct our minorizations and majorizations.
**Lemma 4.1:** \(f_r(x):=(x'Ax)^r\) is convex on \(x'Ax>0\) if and only if \(r\geq\frac12\).
**Proof:** The first and second derivative are
\[
\mathcal{D}f_r(x)=2r(x'Ax)^{r-1}Ax,
\]
and
\[
\mathcal{D}^2f_r(x)=2r(x'Ax)^{r-1}\left(A+2(r-1)\frac{Axx'A}{x'Ax}\right).
\]
The matrix \(H_r(x):= A+
2(r-1)\frac{Axx'A}{x'Ax}\) is psd for \(r=\frac12\), and its eigenvalues increase with \(r\). Thus it is psd for all \(r\geq\frac12\).
Also, if \(0<r<\frac12\), then choosing \(u=x\) gives \(x'H_r(x)x=(2r-1)x'Ax<0\), so that \(H_r(x)\) is not psd and \(f_r\) is not convex on \(x'Ax>0\).
**QED**
**Lemma 4.2:**
If $r\geq\frac12$ then
\[
f_r(x)\geq(1-2r)f_r(y)+2rf_{r-1}(y)y'Ax.
\]
**Proof:**
Follows from the fact that $f_r$ is convex, i.e. above each of its tangents.
**QED**
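A quick numerical spot check of the lemma (a sketch with an arbitrary psd matrix, not part of the paper's code):

```{r lemma42_check, eval = FALSE}
set.seed (1)
A <- crossprod (matrix (rnorm (25), 5, 5))   # a random psd matrix
fr <- function (x, r) (sum (x * (A %*% x))) ^ r
r <- 0.8                                     # any r >= 1/2
x <- rnorm (5)
y <- rnorm (5)
# tangent minorization of Lemma 4.2; should be TRUE
fr (x, r) >= (1 - 2 * r) * fr (y, r) + 2 * r * fr (y, r - 1) * sum (y * (A %*% x))
```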
Now write \(\overline{\lambda}(X)\) or \(\overline{\lambda}_X\) for the largest eigenvalue of a matrix \(X\),
and \(\underline{\lambda}(X)\) or \(\underline{\lambda}_X\) for the smallest eigenvalue. Note that if \(A=I\otimes E_{j\ell}\) then \(\overline{\lambda}_A=2\).
**Lemma 4.3:** If \(r\geq 1\) then
\[
\overline{\lambda}(\mathcal{D}^2f_r(x))\leq
2r(2r-1)\overline{\lambda}^r_A\ (x'x)^{r-1}.
\]
If \(x'x\leq 1\) then
\[
\overline{\lambda}(\mathcal{D}^2f_r(x))\leq
2r(2r-1)\overline{\lambda}^r_A.
\]
**Proof:** If \(r\geq 1\), then, by Cauchy-Schwarz, \((u'Ax)^2\leq(u'Au)(x'Ax)\), and thus
\[
u'H_r(x)u=u'Au+2(r-1)\frac{(u'Ax)^2}{x'Ax}\leq (2r-1)u'Au.
\]
Thus
\[
\overline{\lambda}(H_r(x))\leq (2r-1)\overline{\lambda}_A,
\]
and, because \(x'Ax\leq\overline{\lambda}_A\,x'x\) and \(r-1\geq 0\),
\[
\overline{\lambda}(\mathcal{D}^2f_r(x))\leq 2r(2r-1)\overline{\lambda}_A(x'Ax)^{r-1}
\leq 2r(2r-1)\overline{\lambda}^r_A\ (x'x)^{r-1}.
\]
Finally, if $x'x\leq 1$ then $(x'x)^{r-1}\leq 1$.
**QED**
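Again a numerical spot check (a sketch, not part of the paper's code), comparing the largest eigenvalue of \(\mathcal{D}^2f_r(x)\) with the bound:

```{r lemma43_check, eval = FALSE}
set.seed (2)
A <- crossprod (matrix (rnorm (25), 5, 5))
r <- 1.5                                     # any r >= 1
x <- rnorm (5)
q <- sum (x * (A %*% x))
# analytic second derivative of f_r at x
H <- 2 * r * q ^ (r - 1) * (A + 2 * (r - 1) * tcrossprod (A %*% x) / q)
bound <- 2 * r * (2 * r - 1) * max (eigen (A)$values) ^ r * sum (x ^ 2) ^ (r - 1)
max (eigen (H, symmetric = TRUE)$values) <= bound   # should be TRUE
```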
**Lemma 4.4:**
If \(0<r\leq 1\) then
\[
f_r(x)\leq(1-r)f_r(y)+rf_{r-1}(y)\,x'Ax.
\]
**Proof:**
Follows from concavity of \(t\mapsto t^r\) for \(0<r\leq 1\), applied at \(t=x'Ax\): a concave function is below each of its tangents.
**QED**
Bounds of this type give, for \(0<r<\frac12\), a linear majorization of \(\eta_r\) and a quadratic minorization of \(\rho_r\), while for \(r\geq\frac12\) Lemma 4.2 gives a linear minorization of \(\rho_r\) and Lemma 4.3 a quadratic majorization of \(\eta_r\). In both cases the resulting majorization of \(\sigma_r(\cdot,\alpha)\) is minimized over \(x'x=1\) by a matrix-vector multiplication followed by normalization; the two branches of the function rStressMin in the code section correspond to these two cases.
Minimizing rStress in the limiting case \(r\rightarrow 0\) can be done by minimizing the MULTISCALE loss
\[
\sigma_0(x):=\sum_{i=1}^nw_i(\log\delta_i-\log x'A_ix)^2.
\]
Now use
\[
\log x=\lim_{r\rightarrow 0}\frac{x^r-1}{r}.
\]
Thus, for small \(r\),
\[
\log\delta_{i}-\log x'A_ix\approx\frac{\delta_i^r-(x'A_ix)^r}{r},
\]
and
\[
\sigma_0(x)\approx \frac{1}{r^2}\sum_{i=1}^n w_i(\delta_i^r-(x'A_ix)^r)^2.
\]
On the other hand, if \(\delta_i\approx x'A_ix\) then for any \(r\) we have the approximation
\[
\log (x'A_ix)^r\approx\log\delta_i^r+\frac{1}{\delta_i^r}((x'A_ix)^r-\delta_i^r),
\]
so that
\[
\sigma_0(x)\approx \sum_{i=1}^n \frac{w_i}{\delta_i^{2r}}(\delta_i^r-(x'A_ix)^r)^2.
\]
Both approximations to $\sigma_0$ can be minimized by our iterative majorization algorithm for
$r<\frac12$.
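A quick numerical illustration of the limit underlying these approximations:

```{r log_limit, eval = FALSE}
# (delta^r - t^r) / r approaches log(delta) - log(t) as r goes to zero
delta <- 2.5
t <- 1.7
for (r in c(1, .1, .01, .001))
  cat (r, (delta ^ r - t ^ r) / r, log (delta) - log (t), "\n")
```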
Note that our definition of rStress compares dissimilarities with powers of Euclidean distances.
Alternatively, we can compare powers of dissimilarities with corresponding powers of Euclidean distances, i.e. we can define another rStress as
\begin{equation}
\tilde\sigma_r(x):=\sum_{i=1}^n w_i((\delta_i^2)^r-(x'A_ix)^r)^2.\label{E:tilde}
\end{equation}
This is similar to our approach in the limiting case $r\rightarrow 0$. We do not need an additional algorithm to minimize $\tilde\sigma_r$, because we can just input the \((\delta_i^2)^r\) instead
of $\delta_i$ in our previous majorization method.
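For a generic dissimilarity matrix `delta` and power `r` this is a one-liner (a sketch; the Ekman example below does exactly this with \(r=0.01\)):

```{r tilde_sketch, eval = FALSE}
# minimize tilde-sigma_r by substitution: feed (delta^2)^r in place of delta
htilde <- rStressMin ((delta ^ 2) ^ r, r = r, verbose = FALSE)
```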
The difference between $\sigma_r$ and $\tilde\sigma_r$ disappears, of course, if we make the algorithm nonmetric. In that case we alternate minimization over $x$ with minimization over monotone transformations of $\delta$ (or of $\delta^r$, which is of course the same thing). We could easily add a monotone regression step to our alternating least squares algorithm.
#Examples
##Artificial
Let's start with a small example that shows the flow of the algorithm. We perform one iteration and write out the intermediary results in detail.
Suppose $n=3$, the weights
$w$ are all one, and the dissimilarities are 1, 2, 3. The matrices $A$ are
```{r amat, echo = FALSE}
a <- as.list (1:3)
v <- c(1L, 1L, -1L, -1L)
print (a[[1]] <- outer(v, v))
v <- c(1L, -1L, -1L, 1L)
print (a[[2]] <- outer(v, v))
v <- c(1L, -1L, 1L, -1L)
print (a[[3]] <- outer(v, v))
```
```{r drc, echo = FALSE}
r <- 1
dr <- (4 * r - 1) * sum (sapply (a, function (z) (max (eigen (z)$values)) ^ (2 * r)))
```
The largest eigenvalues of all three $A$ matrices are equal to 4. Thus \(\delta_r=(4r-1)\sum_{i=1}^nw_i\overline{\lambda}_{A_i}^{2r}\) is `r dr`.
Let's start with an $x$ of four random normals. We want to minimize rStress for $r=1$, i.e. sStress.
```{r dist, echo = FALSE}
set.seed (12345)
x <- rnorm (4)
delta <- 1:3
print (d <- sapply (a, function (z) sum (x * (z %*% x))))
dd <- d ^ r
sigma <- sum((delta - dd) ^ 2)
rho <- sum (delta * dd)
eta <- sum (dd ^ 2)
alpha <- rho/eta
```
At the start sStress is `r sigma`. Also $\rho$ is `r rho` and $\eta$ is
`r eta`. This means $\alpha$ is `r alpha` and the minimum of rStress over $\alpha$ is
`r sum((delta - alpha * dd) ^ 2)`.
We now compute the matrices \(B_r:=\sum_{i=1}^nw_i\delta_i(x'A_ix)^{r-1}A_i\) and \(C_r:=\sum_{i=1}^nw_i(x'A_ix)^{2r-1}A_i\) at \(x\).
```{r bc, echo = FALSE}
b <- c <- matrix (0, 4, 4)
for (i in 1 : 3) {
b <- b + delta[i] * (d[i] ^ (r - 1)) * a[[i]]
c <- c + d[i] ^ (2 * r - 1) * a[[i]]
}
dr <- 3 * (4 ^ (2 * r)) * (4 * r - 1)
```
For $B_r$ we have
```{r bb, echo = FALSE}
b
```
and for $C_r$
```{r cc, echo = FALSE}
c
```
Also, $\delta_r$ is equal to `r dr`, and thus we can compute the update vector $g=(\alpha B_r-\alpha^2(C_r-\delta_r I))x$.
```{r gstep, echo = FALSE}
g <- drop (((alpha * b) - (alpha ^ 2) * (c - dr * diag(4))) %*% x)
xplus <- - g / sqrt (sum (g ^ 2))
dplus <- sapply (a, function (z) sum (xplus * (z %*% xplus)))
ddplus <- dplus ^ r
sigmaplus <- sum ((delta - alpha * ddplus) ^ 2)
```
Now $g$ is
```{r g, echo = FALSE}
g
```
and the new $x$ is
```{r xplus, echo = FALSE}
xplus
```
With this $x$ sStress is `r sigmaplus`. We now go back, compute a new $\alpha$, and so on.
##Dutch Political Parties
@degruijter_67 collected dissimilarity measures for nine Dutch political parties.
```{r poldist, echo = FALSE}
print(poldist <-
structure(c(5.63, 5.27, 4.6, 4.8, 7.54, 6.73, 7.18, 6.17, 6.72,
5.64, 6.22, 5.12, 4.59, 7.22, 5.47, 5.46, 4.97, 8.13, 7.55, 6.9,
4.67, 3.2, 7.84, 6.73, 7.28, 6.13, 7.8, 7.08, 6.96, 6.04, 4.08,
6.34, 7.42, 6.88, 6.36, 7.36), Labels = c("KVP", "PvdA", "VVD",
"ARP", "CHU", "CPN", "PSP", "BP", "D66"), Size = 9L, call = quote(as.dist.default(m = polpar)), class = "dist", Diag = FALSE, Upper = FALSE))
poldist <- as.matrix (poldist)
```
```{r poldist_compute, echo = FALSE, cache = TRUE}
r <- c(.10, .25, .5, .75, 1, 2)
allp <- as.list (r)
for (i in 1:length (allp))
allp[[i]] <- rStressMin (poldist, r = r[i], verbose = FALSE)
allr <- matchConf (lapply (allp, function (z) z$x), verbose = FALSE)
allq <- array(0, c(9,2,6))
for (i in 1:6) allq[,,i] <- allr[[i]]
allr <- aperm (allq, c(3, 2, 1))
```
The solutions for `r r` are given in figure `r figure_nums("poldist_mfrow", display = "n")`.
```{r poldist_mfrow, echo = FALSE, fig.align = "center"}
par(mfrow=c(2,3), pty = "s")
for (i in 1:6) {
plot (allp[[i]]$x, type = "n", xlim = c(-.6,.6), ylim = c(-.6,.6), main = paste("r = ", r[i]),
xlab = "dimension 1", ylab = "dimension 2")
text (allp[[i]]$x, row.names (poldist), col = "RED")
}
```
`r figure_nums("poldist_mfrow")`
The basic structure of all solutions is the same: we see the familiar *horseshoe*, in which the left-right scale is bent so that the extreme left (CPN) and extreme right (BP) are close. This is perhaps clearest for r = .75. If r becomes large, then we see increasing clustering.
We also give all six solutions plotted on top of each other (after Procrustes matching) in figure `r figure_nums("poldist_configuration", display = "n")`. It shows that the two extreme parties are stable over solutions, but the middle is more variable.
```{r poldist_configuration, echo = FALSE, fig.align = "center"}
plot (1:10, xlim = c(-.4,.4), ylim = c(-.5,.5), xlab = "dimension 1", ylab = "dimension 2", type = "n", asp = 1)
for (i in 1:9) {
points (allr[,,i], col = "RED")
lines (allr[,,i], col = "BLUE")
}
```
`r figure_nums("poldist_configuration")`
Plotting rStress in figure `r figure_nums("poldist_rstress", display = "n")` as a function of r shows, somewhat surprisingly perhaps, an increasing function. Because the data are average dissimilarities, it is likely there is a fairly large additive constant, and additive constants correspond with lower values of r.
```{r poldist_rstress, echo = FALSE, fig.align = "center"}
sigma <- sapply (allp, function (x) x$sigma)
plot(r, sigma, xlab = "r", ylab = "sigma", type = "l", col = "RED", lwd = 3)
```
`r figure_nums("poldist_rstress")`
We put the number of iterations and the rStress values in a small table.
```{r poldist_iterations, echo = FALSE}
itel <- sapply (allp, function (x) x$itel)
for (i in 1:6)
  cat("r: ", formatC(r[i], digits = 2, width = 4, format = "f"),
      " iterations: ", formatC(itel[i], width = 6, format = "d"),
      " rStress: ", formatC(sigma[i], digits = 6, width = 8, format = "f"), "\n")
```
`r figure_nums("poldist_shepard", display = "cite")` shows the six Shepard diagrams. We see the clustering for r = 2. For r = .1 the Shepard diagram becomes concave, indicating that the larger dissimilarities are underestimated and reflecting the fact that for small powers the powered distances will all be close to one.
```{r poldist_shepard, echo = FALSE, fig.align = "center"}
par(mfrow=c(2,3))
for (i in 1:6) {
dd <- as.dist(sqdist(allp[[i]]$x)^r[i])
plot (as.dist(poldist), dd, main = paste("r = ", r[i]),
xlab = "dissimilarity", ylab = "distance power", col = "RED")
}
```
`r figure_nums("poldist_shepard")`
##Ekman Color Data
We use the color data from @ekman_54, with two dimensions and unit weights. As we know from previous analyses, MDS of the Ekman data gives a very good fit and a very clear representation of the color circle.
```{r ekman, echo = FALSE}
ekman <-
structure(c(0.86, 0.42, 0.42, 0.18, 0.06, 0.07, 0.04, 0.02, 0.07,
0.09, 0.12, 0.13, 0.16, 0.5, 0.44, 0.22, 0.09, 0.07, 0.07, 0.02,
0.04, 0.07, 0.11, 0.13, 0.14, 0.81, 0.47, 0.17, 0.1, 0.08, 0.02,
0.01, 0.02, 0.01, 0.05, 0.03, 0.54, 0.25, 0.1, 0.09, 0.02, 0.01,
0, 0.01, 0.02, 0.04, 0.61, 0.31, 0.26, 0.07, 0.02, 0.02, 0.01,
0.02, 0, 0.62, 0.45, 0.14, 0.08, 0.02, 0.02, 0.02, 0.01, 0.73,
0.22, 0.14, 0.05, 0.02, 0.02, 0, 0.33, 0.19, 0.04, 0.03, 0.02,
0.02, 0.58, 0.37, 0.27, 0.2, 0.23, 0.74, 0.5, 0.41, 0.28, 0.76,
0.62, 0.55, 0.85, 0.68, 0.76), Size = 14L, call = quote(as.dist.default(m = b)), class = "dist", Diag = FALSE, Upper = FALSE, Labels = c(434,
445, 465, 472, 490, 504, 537, 555, 584, 600, 610, 628, 651, 674
))
ekman <- as.matrix (1-ekman)
print (as.dist (ekman))
wave <- row.names (ekman)
```
```{r ekman_compute, echo = FALSE, cache = TRUE}
r <- c(.10, .25, .5, .75, 1, 2)
allh <- as.list (r)
for (i in 1:length (allh))
allh[[i]] <- rStressMin (ekman, r = r[i], verbose = FALSE)
allz <- matchConf (lapply (allh, function (z) z$x), verbose = FALSE)
allx <- array(0, c(14,2,6))
for (i in 1:6) allx[,,i] <- allz[[i]]
ally <- aperm (allx, c(3, 2, 1))
```
In our algorithm we always start with a classical metric scaling solution [@torgerson_58]. We minimize rStress for r equal to `r r`. The six solutions are plotted jointly in `r figure_nums("ekman_configuration", display = "cite")`. Because of the very good fit, solutions for different values of r are similar and very much on the color circle.
```{r ekman_configuration, echo = FALSE, fig.align = "center"}
plot (1:10, xlim = c(-.3,.3), ylim = c(-.3,.3), xlab = "dimension 1", ylab = "dimension 2", type = "n", asp = 1)
for (i in 1:14) {
points (ally[,,i], col = "RED")
lines (ally[,,i], col = "BLUE")
}
```
`r figure_nums("ekman_configuration")`
The Shepard diagrams in figure `r figure_nums("ekman_shepard", display = "n")` are interesting, because, unlike the configurations, they are quite different, becoming more and more convex as
r increases. The plot for r = .25 looks best.
```{r ekman_shepard, echo = FALSE, fig.align = "center"}
par(mfrow=c(2,3))
for (i in 1:6) {
dd <- as.dist(sqdist(allh[[i]]$x)^r[i])
plot (as.dist(ekman), dd, main = paste("r = ", r[i]),
xlab = "dissimilarity", ylab = "distance power", col = "RED")
}
```
`r figure_nums("ekman_shepard")`
We plot rStress as a function of r in `r figure_nums("ekman_rstress", display = "cite")`. The best fit is attained for r = .25, which means fitting square roots of Euclidean distances to dissimilarities. Nonmetric MDS of the Ekman data has already indicated that the optimal nonmetric fit is attained with a concave increasing transformation.
```{r ekman_rstress, echo = FALSE, fig.align = "center"}
sigma <- sapply (allh, function (x) x$sigma)
plot(r, sigma, xlab = "r", ylab = "sigma", type = "l", col = "RED", lwd = 3)
```
`r figure_nums("ekman_rstress")`
```{r ekman_iterations, echo = FALSE}
itel <- sapply (allh, function (x) x$itel)
for (i in 1:6)
  cat("r: ", formatC(r[i], digits = 2, width = 4, format = "f"),
      " iterations: ", formatC(itel[i], width = 6, format = "d"),
      " rStress: ", formatC(sigma[i], digits = 6, width = 8, format = "f"), "\n")
```
Especially when $r$ is far from $\frac12$ it will make a difference whether we minimize $\sigma_r$ or
the $\tilde\sigma_r$ of $\eqref{E:tilde}$. We illustrate this by choosing $r=.01$, which will presumably take us close to the logarithm.
```{r tilde, echo = FALSE, cache = TRUE}
htb <- rStressMin ((ekman ^ 2) ^ 0.01, r = 0.01, verbose = FALSE)
```
```{r tilde_plot, echo = FALSE, fig.align = "center"}
plot (htb$x, xlim = c(-.4,.4), ylim = c(-.4,.4), col = "RED", xlab = "dimension 1", ylab = "dimension 2", asp = 1, type = "n")
text (htb$x, wave, col = "RED")
lines (htb$x, col = "BLUE")
```
`r figure_nums("ekman_tilde")`
After `r prettyNum(htb$itel)` iterations we find a $\tilde\sigma_r$ value of `r formatC(htb$sigma, digits = 6, format = "f")`. We see a rather clear deviation from the circular pattern we found with
rStress.
#Code
```{r displaycode, eval = FALSE}
torgerson <- function (delta, p = 2) {
  # classical (Torgerson) metric MDS, used as the initial configuration
  doubleCenter <- function (x) {
    n <- dim (x)[1]
    m <- dim (x)[2]
    s <- sum (x) / (n * m)
    xr <- rowSums (x) / m
    xc <- colSums (x) / n
    return ((x - outer (xr, xc, "+")) + s)
  }
  z <- eigen (-doubleCenter ((as.matrix (delta) ^ 2) / 2), symmetric = TRUE)
  v <- pmax (z$values, 0)
  return (z$vectors[, 1:p] %*% diag (sqrt (v[1:p])))
}
enorm <- function (x, w = 1) {
  # weighted Euclidean norm
  return (sqrt (sum (w * (x ^ 2))))
}
sqdist <- function (x) {
  # matrix of squared Euclidean distances between the rows of x
  s <- tcrossprod (x)
  v <- diag (s)
  return (outer (v, v, "+") - 2 * s)
}
mkBmat <- function (x) {
  # off-diagonal elements become -x, the row sums of x go on the diagonal
  d <- rowSums (x)
  x <- -x
  diag (x) <- d
  return (x)
}
mkPower <- function (x, r) {
  # elementwise power of a matrix with zero diagonal; the identity is added
  # before and subtracted after exponentiation, so the diagonal stays zero
  # and 0 ^ (r - 1) is never formed
  n <- nrow (x)
  return (abs ((x + diag (n)) ^ r) - diag (n))
}
matchConf <- function (x, eps = 1e-6, itmax = 100, verbose = TRUE) {
  # orthogonal Procrustes matching of a list of configurations: each one is
  # rotated to the current average until the total squared difference settles
  m <- length (x)
  n <- nrow (x[[1]])
  p <- ncol (x[[1]])
  itel <- 1
  dold <- Inf
  repeat {
    y <- matrix (0, n, p)
    for (k in 1:m)
      y <- y + x[[k]]
    y <- y / m
    dnew <- 0
    for (k in 1:m) {
      s <- svd (crossprod (x[[k]], y))
      x[[k]] <- tcrossprod (x[[k]] %*% (s$u), s$v)
      dnew <- dnew + sum ((y - x[[k]]) ^ 2)
    }
    if (verbose) {
      cat (formatC (itel, width = 4, format = "d"),
           formatC (dold, digits = 10, width = 13, format = "f"),
           formatC (dnew, digits = 10, width = 13, format = "f"),
           "\n")
    }
    if ((itel == itmax) || ((dold - dnew) < eps))
      break ()
    itel <- itel + 1
    dold <- dnew
  }
  return (x)
}
rStressMin <- function (delta,
                        w = 1 - diag (nrow (delta)),
                        p = 2,
                        r = 0.5,
                        eps = 1e-10,
                        itmax = 100000,
                        verbose = TRUE) {
  delta <- delta / enorm (delta, w)       # normalize sum w * delta^2 = 1
  itel <- 1
  xold <- torgerson (delta, p = p)        # classical MDS start
  xold <- xold / enorm (xold)
  n <- nrow (xold)
  nn <- diag (n)
  dold <- sqdist (xold)
  rold <- sum (w * delta * mkPower (dold, r))      # rho_r
  nold <- sum (w * mkPower (dold, 2 * r))          # eta_r
  aold <- rold / nold                              # optimal alpha
  sold <- 1 - 2 * aold * rold + (aold ^ 2) * nold  # sigma_r
  repeat {
    p1 <- mkPower (dold, r - 1)
    p2 <- mkPower (dold, (2 * r) - 1)
    by <- mkBmat (w * delta * p1)
    cy <- mkBmat (w * p2)
    ga <- 2 * sum (w * p2)
    be <- (2 * r - 1) * (2 ^ r) * sum (w * delta)
    de <- (4 * r - 1) * (4 ^ r) * sum (w)
    if (r >= 0.5) {
      # majorization update matrix for r >= 1/2
      my <- by - aold * (cy - de * nn)
    }
    if (r < 0.5) {
      # majorization update matrix for r < 1/2
      my <- (by - be * nn) - aold * (cy - ga * nn)
    }
    xnew <- my %*% xold
    xnew <- xnew / enorm (xnew)           # project back onto x'x = 1
    dnew <- sqdist (xnew)
    rnew <- sum (w * delta * mkPower (dnew, r))
    nnew <- sum (w * mkPower (dnew, 2 * r))
    anew <- rnew / nnew
    snew <- 1 - 2 * anew * rnew + (anew ^ 2) * nnew
    if (verbose) {
      cat (formatC (itel, width = 4, format = "d"),
           formatC (sold, digits = 10, width = 13, format = "f"),
           formatC (snew, digits = 10, width = 13, format = "f"),
           "\n")
    }
    if ((itel == itmax) || ((sold - snew) < eps))
      break ()
    itel <- itel + 1
    xold <- xnew
    dold <- dnew
    sold <- snew
    aold <- anew
  }
  return (list (x = xnew, alpha = anew, sigma = snew, itel = itel))
}
```
#NEWS
001 01/12/16
-- First published version, without code and examples
002 01/12/16
-- Added artificial example and code.
003 01/13/16
-- Squashed two nasties
004 01/13/16
-- Added Ekman Example
005 01/13/16
-- Many small edits
006 01/13/16
-- Reorganized proofs
007 01/13/16
-- Added political parties example
008 01/13/16
-- Added configuration matching
009 01/13/16
-- Added Ekman power solution
010 01/14/16
-- Text in examples
-- Additional plots
011 01/14/16
-- small change in theorems and code
#References