9 Week 5 Demo
9.1 Setup
First, we’ll load the packages we’ll be using in this week’s brief demo.
devtools::install_github("conjugateprior/austin")
library(austin)
library(quanteda)
library(quanteda.textstats)
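Note that install_github() comes from the devtools package; if devtools is not already installed on your machine, you may need to install it first:
install.packages("devtools")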
9.2 Wordscores
We can inspect the function for the wordscores model by Laver, Benoit, and Garry (2003) in the following way:
classic.wordscores
## function (wfm, scores)
## {
##     if (!is.wfm(wfm))
##         stop("Function not applicable to this object")
##     if (length(scores) != length(docs(wfm)))
##         stop("There are not the same number of documents as scores")
##     if (any(is.na(scores)))
##         stop("One of the reference document scores is NA\nFit the model with known scores and use 'predict' to get virgin score estimates")
##     thecall <- match.call()
##     C.all <- as.worddoc(wfm)
##     C <- C.all[rowSums(C.all) > 0, ]
##     F <- scale(C, center = FALSE, scale = colSums(C))
##     ws <- apply(F, 1, function(x) {
##         sum(scores * x)
##     })/rowSums(F)
##     pi <- matrix(ws, nrow = length(ws))
##     rownames(pi) <- rownames(C)
##     colnames(pi) <- c("Score")
##     val <- list(pi = pi, theta = scores, data = wfm, call = thecall)
##     class(val) <- c("classic.wordscores", "wordscores", class(val))
##     return(val)
## }
## <bytecode: 0x2d4acbca8>
## <environment: namespace:austin>
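In other words, the function first converts each column of the word–document matrix into relative frequencies and then scores each word as a frequency-weighted average of the reference scores. Restating the code above, roughly in the notation of Laver, Benoit, and Garry (2003):
\[
F_{wd} = \frac{C_{wd}}{\sum_{w} C_{wd}}, \qquad
\pi_{w} = \frac{\sum_{d} A_{d} F_{wd}}{\sum_{d} F_{wd}},
\]
where \(C_{wd}\) is the count of word \(w\) in reference document \(d\) and \(A_{d}\) is the reference score assigned to document \(d\).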
We can then take some example data included in the austin package. Here, our reference documents are all those documents marked with an “R” for reference, i.e., columns one to five.
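The ref object printed below is built from the lbg example data bundled with austin. A minimal way to construct it, using the getdocs() helper that also appears later for the virgin text (the numeric column indexing here is an assumption), is:
data(lbg)
ref <- getdocs(lbg, 1:5)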
ref
## docs
## words R1 R2 R3 R4 R5
## A 2 0 0 0 0
## B 3 0 0 0 0
## C 10 0 0 0 0
## D 22 0 0 0 0
## E 45 0 0 0 0
## F 78 2 0 0 0
## G 115 3 0 0 0
## H 146 10 0 0 0
## I 158 22 0 0 0
## J 146 45 0 0 0
## K 115 78 2 0 0
## L 78 115 3 0 0
## M 45 146 10 0 0
## N 22 158 22 0 0
## O 10 146 45 0 0
## P 3 115 78 2 0
## Q 2 78 115 3 0
## R 0 45 146 10 0
## S 0 22 158 22 0
## T 0 10 146 45 0
## U 0 3 115 78 2
## V 0 2 78 115 3
## W 0 0 45 146 10
## X 0 0 22 158 22
## Y 0 0 10 146 45
## Z 0 0 3 115 78
## ZA 0 0 2 78 115
## ZB 0 0 0 45 146
## ZC 0 0 0 22 158
## ZD 0 0 0 10 146
## ZE 0 0 0 3 115
## ZF 0 0 0 2 78
## ZG 0 0 0 0 45
## ZH 0 0 0 0 22
## ZI 0 0 0 0 10
## ZJ 0 0 0 0 3
## ZK 0 0 0 0 2
This matrix simply records, for a set of words (here: letters A to ZK), how often each appears in each of the five reference texts.
We can then look at the wordscores for each of the words, calculated using the reference scores assigned to each of the reference documents.
ws <- classic.wordscores(ref, scores=seq(-1.5,1.5,by=0.75))
ws
## $pi
## Score
## A -1.5000000
## B -1.5000000
## C -1.5000000
## D -1.5000000
## E -1.5000000
## F -1.4812500
## G -1.4809322
## H -1.4519231
## I -1.4083333
## J -1.3232984
## K -1.1846154
## L -1.0369898
## M -0.8805970
## N -0.7500000
## O -0.6194030
## P -0.4507576
## Q -0.2992424
## R -0.1305970
## S 0.0000000
## T 0.1305970
## U 0.2992424
## V 0.4507576
## W 0.6194030
## X 0.7500000
## Y 0.8805970
## Z 1.0369898
## ZA 1.1846154
## ZB 1.3232984
## ZC 1.4083333
## ZD 1.4519231
## ZE 1.4809322
## ZF 1.4812500
## ZG 1.5000000
## ZH 1.5000000
## ZI 1.5000000
## ZJ 1.5000000
## ZK 1.5000000
##
## $theta
## [1] -1.50 -0.75 0.00 0.75 1.50
##
## $data
## docs
## words R1 R2 R3 R4 R5
## A 2 0 0 0 0
## B 3 0 0 0 0
## C 10 0 0 0 0
## D 22 0 0 0 0
## E 45 0 0 0 0
## F 78 2 0 0 0
## G 115 3 0 0 0
## H 146 10 0 0 0
## I 158 22 0 0 0
## J 146 45 0 0 0
## K 115 78 2 0 0
## L 78 115 3 0 0
## M 45 146 10 0 0
## N 22 158 22 0 0
## O 10 146 45 0 0
## P 3 115 78 2 0
## Q 2 78 115 3 0
## R 0 45 146 10 0
## S 0 22 158 22 0
## T 0 10 146 45 0
## U 0 3 115 78 2
## V 0 2 78 115 3
## W 0 0 45 146 10
## X 0 0 22 158 22
## Y 0 0 10 146 45
## Z 0 0 3 115 78
## ZA 0 0 2 78 115
## ZB 0 0 0 45 146
## ZC 0 0 0 22 158
## ZD 0 0 0 10 146
## ZE 0 0 0 3 115
## ZF 0 0 0 2 78
## ZG 0 0 0 0 45
## ZH 0 0 0 0 22
## ZI 0 0 0 0 10
## ZJ 0 0 0 0 3
## ZK 0 0 0 0 2
##
## $call
## classic.wordscores(wfm = ref, scores = seq(-1.5, 1.5, by = 0.75))
##
## attr(,"class")
## [1] "classic.wordscores" "wordscores"
## [3] "list"
Here we see the thetas contained in this wordscores object, i.e., the reference scores for each of the reference documents, and the pis, i.e., the estimated wordscores for each word.
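As a quick sanity check, we can reproduce one of these wordscores by hand. The sketch below recomputes the score for word “S” (which appears 22, 158, and 22 times in R2, R3, and R4), assuming ref behaves like a base R matrix:
A <- seq(-1.5, 1.5, by = 0.75)      # reference scores for R1-R5
Fwd <- ref["S", ] / colSums(ref)    # relative frequency of "S" in each reference text
sum(A * Fwd / sum(Fwd))             # 0, matching the pi reported for "S" above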
We can now use these to score the so-called “virgin” texts as follows.
#get "virgin" documents
vir <- getdocs(lbg, 'V1')
vir
## docs
## words V1
## A 0
## B 0
## C 0
## D 0
## E 0
## F 0
## G 0
## H 2
## I 3
## J 10
## K 22
## L 45
## M 78
## N 115
## O 146
## P 158
## Q 146
## R 115
## S 78
## T 45
## U 22
## V 10
## W 3
## X 2
## Y 0
## Z 0
## ZA 0
## ZB 0
## ZC 0
## ZD 0
## ZE 0
## ZF 0
## ZG 0
## ZH 0
## ZI 0
## ZJ 0
## ZK 0
# predict textscores for the virgin documents
predict(ws, newdata=vir)
## 37 of 37 words (100%) are scorable
##
## Score Std. Err. Rescaled Lower Upper
## V1 -0.448 0.0119 -0.448 -0.459 -0.437
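Behind this prediction, each virgin text receives the frequency-weighted average of the wordscores of the scorable words it contains; as a sketch of the calculation:
\[
\hat{\theta}_{v} = \sum_{w} F_{wv}\, \pi_{w},
\]
where \(F_{wv}\) is the relative frequency of word \(w\) among the scorable words of virgin text \(v\) and \(\pi_{w}\) is its estimated wordscore.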
9.3 Wordfish
If we wish, we can inspect the function for the wordfish model by Slapin and Proksch (2008) in the following way. This is a much more complex algorithm, which is not printed here, but which you can inspect on your own.
wordfish
We can then simulate some data, formatted appropriately for wordfish estimation, in the following way:
dd <- sim.wordfish()
dd
## $Y
## docs
## words D01 D02 D03 D04 D05 D06 D07 D08 D09 D10
## W01 17 19 22 13 17 11 16 12 6 3
## W02 18 21 18 16 12 19 11 7 10 4
## W03 22 21 22 19 11 14 11 3 6 1
## W04 22 19 18 15 16 13 18 6 2 8
## W05 28 21 12 10 13 10 5 14 1 3
## W06 5 7 12 13 15 8 12 13 23 19
## W07 13 9 5 16 11 17 15 11 35 30
## W08 8 7 7 10 9 15 18 23 21 23
## W09 4 12 8 10 9 13 18 25 15 19
## W10 5 3 7 11 19 16 13 18 17 18
## W11 66 55 49 48 38 37 27 24 21 6
## W12 53 56 47 39 49 28 22 15 12 14
## W13 63 55 47 49 48 31 24 16 17 16
## W14 57 64 48 51 27 36 24 27 11 12
## W15 58 48 57 44 36 39 29 27 16 5
## W16 17 13 24 28 24 32 41 56 67 61
## W17 9 19 16 36 30 34 53 34 58 57
## W18 11 19 34 27 42 38 48 58 49 66
## W19 10 18 27 22 37 52 59 60 60 69
## W20 14 14 20 23 37 37 36 51 53 66
##
## $theta
## [1] -1.4863011 -1.1560120 -0.8257228 -0.4954337 -0.1651446
## [6] 0.1651446 0.4954337 0.8257228 1.1560120 1.4863011
##
## $doclen
## D01 D02 D03 D04 D05 D06 D07 D08 D09 D10
## 500 500 500 500 500 500 500 500 500 500
##
## $psi
## [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
##
## $beta
## [1] 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1
##
## attr(,"class")
## [1] "wordfish.simdata" "list"
Here we can see the document lengths and the word-level parameters (psi and beta), as well as the specified range of thetas, i.e., the true document positions to be estimated.
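For reference, the model these quantities come from is the Poisson scaling model of Slapin and Proksch (2008):
\[
y_{ij} \sim \mathrm{Poisson}(\lambda_{ij}), \qquad
\log \lambda_{ij} = \alpha_{i} + \psi_{j} + \beta_{j}\,\theta_{i},
\]
where \(\alpha_{i}\) is a document fixed effect, \(\psi_{j}\) a word fixed effect, \(\beta_{j}\) a word-specific weight capturing how strongly word \(j\) discriminates between positions, and \(\theta_{i}\) the position of document \(i\), which is what we want to estimate.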
Estimating the document positions is then simply a matter of applying this algorithm to the simulated counts, as sketched below.
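The Call: line in the output below shows the call that produced it; a minimal version (assuming, as in austin, that summary() prints the estimated document positions) is:
wf <- wordfish(dd$Y)
summary(wf)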
## Call:
## wordfish(wfm = dd$Y)
##
## Document Positions:
## Estimate Std. Error Lower Upper
## D01 -1.4243 0.10560 -1.6313 -1.21736
## D02 -1.1483 0.09747 -1.3394 -0.95727
## D03 -0.7701 0.08954 -0.9456 -0.59455
## D04 -0.4878 0.08591 -0.6562 -0.31942
## D05 -0.1977 0.08414 -0.3626 -0.03279
## D06 0.0313 0.08411 -0.1336 0.19616
## D07 0.4346 0.08704 0.2640 0.60517
## D08 0.7163 0.09140 0.5372 0.89546
## D09 1.2277 0.10447 1.0229 1.43243
## D10 1.6166 0.11933 1.3827 1.85046
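Because we simulated the data, we can also check how well the estimates recover the true positions. A quick sketch, assuming the fitted object stores the estimated positions as wf$theta:
cor(wf$theta, dd$theta)    # assumes wf$theta exists; should be close to 1 if positions are well recovered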