##
## Our solution to Exercise #4 (surface collocations and written/spoken keywords)
##
## Question 1. How to install add-on packages from CRAN
# a) use the package installer in the Windows / Mac OS X GUI
# b) download the appropriate package file from CRAN, then install it from the
#    command line with R CMD INSTALL
# c) install from within R with the function install.packages()
install.packages("corpora")
# see ?install.packages for more options
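# optional: if you cannot write to the system-wide library (e.g. you have no
# administrator rights), install.packages() and library() accept a personal
# library directory -- the path below is only an example, adjust it as needed
# install.packages("corpora", lib="~/R/library")
# library(corpora, lib.loc="~/R/library")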
## Question 2. Load library "corpora" and data sets, read data set documentation
library(corpora)
?BNCInChargeOf
data(BNCInChargeOf)
?BNCcomparison
data(BNCcomparison)
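# optional: take a quick look at the structure and first rows of both data sets
str(BNCInChargeOf)
head(BNCcomparison)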
## Question 3. Contingency tables for surface cooccurrences.
# According to the lecture slides, the first row of the contingency table represents
# word tokens within the collocational spans, the second row word tokens outside the
# collocational spans (for the given node, here the phrase "in charge of"). For each
# collocate W, the first column contains the occurrences of W, and the second column
# all other tokens. This leads to the following equations for the observed
# frequencies O11, O12, O21, O22:
COLL <- transform(BNCInChargeOf,
  O11 = as.numeric(f.in), O12 = as.numeric(N.in - f.in),
  O21 = as.numeric(f.out), O22 = as.numeric(N.out - f.out))
# note that we also convert the new variables to floating-point format, to avoid integer overflow
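# optional sanity check (not part of the exercise): the derived cell counts
# should never be negative if the frequency data are consistent
stopifnot(all(COLL$O12 >= 0), all(COLL$O22 >= 0))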
## Question 4. Marginal frequencies, sample size, expected frequencies, association measures.
# calculate marginal frequencies and sample size as in lecture slides
COLL <- transform(COLL,
  R1 = O11 + O12, R2 = O21 + O22,
  C1 = O11 + O21, C2 = O12 + O22,
  N = O11 + O12 + O21 + O22)
summary(COLL$R1) # should always be the same value (N.in)
summary(COLL$R2) # should always be the same value (N.out)
summary(COLL$N)  # should always be the same value (corpus size)
# calculate expected frequencies
COLL <- transform(COLL,
  E11 = R1 * C1 / N, E12 = R1 * C2 / N,
  E21 = R2 * C1 / N, E22 = R2 * C2 / N)
all.equal(COLL$E11 + COLL$E12 + COLL$E21 + COLL$E22, COLL$N) # check consistency
# calculate association scores: here we use the Dice coefficient for illustration
COLL <- transform(COLL,
  Dice = 2 * O11 / (R1 + C1))
# (equation from www.collocations.de/AM)
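# as an optional aside (not part of the exercise solution), further association
# measures from www.collocations.de/AM can be computed in the same way, e.g. the
# t-score (only meaningful for O11 > 0) and the log-likelihood ratio G2, where
# cells with O = 0 contribute 0 to the sum
COLL <- transform(COLL,
  t.score = (O11 - E11) / sqrt(O11),
  G2 = 2 * (ifelse(O11 > 0, O11 * log(O11 / E11), 0) +
            ifelse(O12 > 0, O12 * log(O12 / E12), 0) +
            ifelse(O21 > 0, O21 * log(O21 / E21), 0) +
            ifelse(O22 > 0, O22 * log(O22 / E22), 0)))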
# now rank the data set by Dice scores (we could also annotate explicit ranks,
# which makes it easier to compare different association measures; see below)
idx.Dice <- order(COLL$Dice, decreasing=TRUE)
COLL.Dice <- COLL[idx.Dice, ]
head(COLL.Dice[, c("collocate", "f.in", "f.out", "Dice")], 20) # select only relevant columns for better readability
# TMTOWTDI -- the lecture slides show other ways of sorting the data set,
# computing rankings and extracting n-best lists; one option is sketched below
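# sketch of the rank annotation mentioned above: explicit ranks (tied scores share
# the smallest rank number) make it easy to compare the rankings of different measures
COLL <- transform(COLL,
  r.Dice = rank(-Dice, ties.method="min"))
head(COLL[order(COLL$r.Dice), c("collocate", "Dice", "r.Dice")], 10)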
## Question 5/6. Keyword identification.
# written and spoken sample sizes (NB: the "OTHER" entry, which aggregates all
# remaining nouns, allows us to compute these directly from the table)
N.written <- sum(BNCcomparison$written)
N.spoken <- sum(BNCcomparison$spoken)
# Contingency table for the frequency comparison, as explained in the lecture slides:
# - the first column contains the data for the spoken sample, the second column for
#   the written sample
# - the first row contains the frequency count of the given noun in the spoken and
#   written sample
# - the second row contains the sample size minus the frequency count
# NB: we have swapped the order of the columns, because some association measures
# work better if the smaller sample is in the first column (as is the case for
# cooccurrence data)
KEY <- transform(BNCcomparison,
  O11 = as.numeric(spoken), O21 = as.numeric(N.spoken - spoken),
  O12 = as.numeric(written), O22 = as.numeric(N.written - written))
## Question 7. Marginal frequencies, N, expected frequencies, association measures (as above)
# calculate marginal frequencies and "total sample size" N as above
KEY <- transform(KEY,
  R1 = O11 + O12, R2 = O21 + O22,
  C1 = O11 + O21, C2 = O12 + O22,
  N = O11 + O12 + O21 + O22)
summary(KEY$C1) # should always be the same value (spoken sample size)
summary(KEY$C2) # should always be the same value (written sample size)
summary(KEY$N)  # should always be the same value ("total sample size")
# calculate expected frequencies
KEY <- transform(KEY,
  E11 = R1 * C1 / N, E12 = R1 * C2 / N,
  E21 = R2 * C1 / N, E22 = R2 * C2 / N)
all.equal(KEY$E11 + KEY$E12 + KEY$E21 + KEY$E22, KEY$N) # check consistency
# we don't really want to calculate keyness for the "OTHER" entry, so delete it
KEY <- subset(KEY, noun != "OTHER")
# calculate association scores: here we use (pointwise) MI, as some researchers
# have done for terminology extraction
KEY <- transform(KEY,
  MI = log2(O11 / E11))
# (equation from www.collocations.de/AM)
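# NB: MI is -Inf for any noun that never occurs in the spoken sample (O11 = 0);
# such nouns would all end up tied at the bottom of the ranking below. If any are
# present, it may be useful to inspect them separately -- a small sketch:
sum(KEY$O11 == 0)               # how many nouns are affected (if any)
head(subset(KEY, O11 == 0), 10) # written-only nouns, not meaningfully ranked by MI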
# now rank the data set by MI scores and list the entries with
# - large positive scores (spoken keywords, i.e. higher relative frequency in the first column)
# - large negative scores (written keywords, i.e. higher relative frequency in the second column)
idx.MI <- order(KEY$MI, decreasing=TRUE)
KEY.MI <- KEY[idx.MI, ]
head(KEY.MI[, c("noun", "written", "spoken", "O11", "E11", "MI")], 20)
tail(KEY.MI[, c("noun", "written", "spoken", "O11", "E11", "MI")], 20)
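# a further optional sketch (the threshold of 1 is arbitrary, for illustration only):
# MI >= 1 means the noun is at least twice as frequent in the spoken sample as
# expected, MI <= -1 at most half as frequent, giving simple keyword cut-offs
spoken.keywords <- subset(KEY.MI, MI >= 1)
written.keywords <- subset(KEY.MI, MI <= -1)
nrow(spoken.keywords)
nrow(written.keywords)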