diff --git a/000inputs.tex b/000inputs.tex
new file mode 100644
index 0000000..ca1ac6f
--- /dev/null
+++ b/000inputs.tex
@@ -0,0 +1,35 @@
+\input{010texfoo}
+
+\title{\mytitle}
+
+\begin{document}
+  \maketitle
+  \input{060abstract}
+
+  % set the TOC with larger line spacing
+  \begin{spacing}{1.1}
+  \tableofcontents
+  \end{spacing}
+  \newpage
+  \pagestyle{plain}
+
+  \input{300}
+
+  	\section{\label{sec:systemoverview}System Overview} 
+  	\input{310}
+	
+	\section{\label{sec:preprocessing}Pre-Processing: Harvesting Web Pages}
+	\input{320}
+	
+	\section{\label{sec:manualannotation}Manual Annotation: Classification of Web-Page Content by Human Annotators}
+	\input{330}
+	
+	\section{\label{sec:goldstandard}The Gold Standard: Compilation and Analysis of manually annotated Data}
+	\input{340}
+	
+	%\section{Summary}
+	%\input{350}
+    
+    \clearpage
+    \input{900bib}
+\end{document}
diff --git a/010texfoo.tex b/010texfoo.tex
new file mode 100644
index 0000000..7ceaa04
--- /dev/null
+++ b/010texfoo.tex
@@ -0,0 +1,174 @@
+\documentclass[11pt,a4paper,oneside,liststotoc,listsleft,abstract=true]{scrartcl}
+
+% selectively in/exclude pieces of text
+\usepackage{comment}
+\includecomment{longversion}
+
+% have koma type headings in rm font
+%\addtokomafont{sectioning}{\rmfamily}
+
+
+%%%% package imports (order matters)
+\usepackage{lineno}
+
+% specifies the encoding of this file (and \include or \input files)
+\usepackage[latin1]{inputenc}
+
+% in pdfTeX an active font can only refer to 256 glyphs at a time;
+% select the std. T1 mapping for this document
+\usepackage[T1]{fontenc}
+
+% activate hyphenation
+\usepackage[english]{babel}
+
+% activate character protruding for margin kerning, 
+% i.e. try to get a 'smoother' margin
+%\usepackage[activate]{pdfcprot}
+\usepackage[protrusion=true,expansion,kerning=true]{microtype}
+
+% activate some symbols, e.g. \textmusicalnote (and more 'important' ones...)
+\usepackage{textcomp}
+
+% activate 'pretty' code listings
+\usepackage{listings}
+
+% activate ip alphabet
+\usepackage{tipa}
+
+% activate the Almost European computer modern font (cf. http://www.ctan.org/tex-archive/fonts/ae/)
+%\usepackage{ae}
+% XOR
+% TeX Gyre (cf. http://www.tug.dk/FontCatalogue)
+\usepackage{tgheros}
+\usepackage{tgtermes}
+% XOR
+% activate springer's minion and myriad font
+%\pdfmapfile{+springer.map}
+%\renewcommand{\sfdefault}{fmy}
+%\renewcommand{\rmdefault}{fmnx}
+%\renewcommand{\ttdefault}{lmtt}
+
+% allow for inclusion of pdf documents
+\usepackage{pdfpages}
+
+%
+%\usepackage[right=7cm,left=2.5cm,top=2cm,bottom=3.5cm]{geometry}
+\usepackage[top=3.0cm,bottom=4.0cm]{geometry}
+\usepackage{setspace}
+
+\usepackage{epic,eepic}
+\usepackage{graphicx}
+\graphicspath{{./}{./images/}}
+% this will produce a warning:
+% LaTeX Warning: Command \@makecol has changed.
+% seems to occur in combination with the setspace package.
+\usepackage[stable, bottom]{footmisc}
+%\usepackage{fullpage}
+\usepackage{url}
+\usepackage{amsmath}
+\usepackage{amssymb}
+\usepackage{tabularx}
+%\usepackage[pdftex]{color}
+\usepackage[
+	pdftex,
+	final=true,
+	pdfstartview=FitH
+	]{hyperref}
+
+%%%% hyper & options
+\definecolor{fuchsia}{rgb}{1,0,1}
+\definecolor{myblue}{rgb}{0.25,0.25,0.75}
+\definecolor{darkblue}{rgb}{0,0,0.75}
+\definecolor{darkred}{rgb}{0.4,0,0}
+\hypersetup{%
+colorlinks=true,
+bookmarks=true,
+bookmarksnumbered=true,
+bookmarksopen=true,
+bookmarksopenlevel=2,
+pdftitle={\mypdftitle},
+pdfauthor={\myauthor},
+pdfsubject={\mytitle},
+pdfkeywords={\mykeywords},
+pdfproducer={pdflatex, inkscape, gnuplot},
+frenchlinks=true,
+pdfborder=0 0 0,
+linkcolor=myblue,
+%pagecolor=darkblue,
+urlcolor=myblue,
+citecolor=darkred,
+setpagesize=true
+}
+
+%
+% align numbering in TOC on the left side, i.e.
+% 1
+% 1.1
+% 1.1.1
+% ...
+%\usepackage{tocloft}
+%\usepackage{chngcntr}
+%\setlength{\cftchapnumwidth}{\cftsubsubsecnumwidth}
+%\setlength{\cftsecnumwidth}{\cftsubsubsecnumwidth}
+%\setlength{\cftsubsecnumwidth}{\cftsubsubsecnumwidth}
+%\setlength{\cftsubsubsecnumwidth}{\cftsubsubsecnumwidth}
+%\setlength{\cftsecindent}{0pt}
+%\setlength{\cftsubsecindent}{0pt}
+%\setlength{\cftsubsubsecindent}{0pt}
+
+%%%% fancy & options
+%\usepackage{fancyhdr}
+%\pagestyle{fancy}
+%\renewcommand{\footrulewidth}{0.5pt}
+%\renewcommand{\headrulewidth}{0.5pt}
+%\setlength{\headheight}{25pt}
+%\setlength{\headsep}{20pt}
+%\renewcommand{\chaptermark}[1]{\markboth{\quad #1}{\quad #1}}
+%\renewcommand{\sectionmark}[1]{\markright{#1}}
+%\fancyhf{}
+%\fancyfoot[CE]{\myauthor}
+%\fancyfoot[CO]{\myStitle}
+%\fancyhead[LE,RO]{\bfseries\thepage}
+%\fancyhead[RE]{\bfseries\leftmark }
+%\fancyhead[LO]{\bfseries\rightmark }
+
+% do not reset footnote count or every chapter
+%\counterwithout*{footnote}{chapter}
+
+%%%% indexing options
+\setcounter{tocdepth}{3}
+\setcounter{secnumdepth}{3}
+%\newcounter{lofdepth}
+%\setcounter{lofdepth}{3}
+
+%%%% new commands
+\DeclareMathOperator{\project}{project}
+\DeclareMathOperator{\CC}{CC}
+\DeclareMathOperator{\NCC}{NCC}
+\newcommand{\src}[1]{\texttt{#1}}
+\newcommand{\fref}[1]{\src{#1} (c.f. \ref{#1})}
+\newcommand{\email}[1]{\href{mailto:#1}{#1}}
+\newcommand{\grad}[0]{^\circ}
+\newcommand{\fig}[4]
+{%
+ \begin{figure}[h]
+  \centering
+  \includegraphics[width=#1\textwidth]{#2}
+  \caption{#3}
+  \label{#4}
+ \end{figure}
+}
+
+%\renewcommand*{\raggedsection}{}
+
+\renewenvironment{abstract}{%
+ \addsec*{\abstractname}
+}
+
+%%%% typesetting options
+\unitlength10mm
+\renewcommand*{\tabularxcolumn}[1]{>{\small}m{#1}}
+
+
+%\usepackage{natbib}
+%\bibliographystyle{plainnat}
diff --git a/060abstract.tex b/060abstract.tex
new file mode 100644
index 0000000..e0d1a64
--- /dev/null
+++ b/060abstract.tex
@@ -0,0 +1,6 @@
+\begin{abstract}
+This document describes the KrdWrd CANOLA Corpus.
+
+The CANOLA Corpus is a visually annotaded English web corpus for training the KrdWrd classification engine to remove boiler plate on unseen web pages.
+It was harvested, annotaded and evaluated by the tools and infrastructur of the KrdWrd Project.
+\end{abstract}
diff --git a/300.tex b/300.tex
new file mode 100644
index 0000000..0c050bb
--- /dev/null
+++ b/300.tex
@@ -0,0 +1,16 @@
+\begin{longversion}
+The KrdWrd Project\cite{krdwrd.org} deals with the design of an abstract architecture for 
+A)~the unified treatment of Web data for automatic processing, \emph{without} neglecting visual information, on annotation and processing side and 
+B)~the appropriate annotation tool to gather data for supervised processing of such data.
+
+The Project comprises an implementation appropriate for pre-processing and cleaning of Web pages, where users are provided with accurate Web page presentations and annotation utilities in a typical browsing environment, while machine learning (ML) algorithms also operate on representations of the visual rendering of Web pages.
+The system also preserves the original Web documents and all the additional information contained therein to make different approaches comparable on identical data.
+
+The system is sketched in \cite{StegerStemle2009}.
+
+For training the KrdWrd ML Engine, a substantial amount of hand-annotated data, viz.~Web pages, are needed. 
+Following, we present the parts of the system that cover the acquisition of training data, i.e.~the steps before training data can be fed into a ML Engine. 
+
+Then, after an overview of the sequence of steps needed to gather new training data in \ref{sec:systemoverview}, an in-depth description of the processing steps \emph{before} Web pages can be presented to annotators in \ref{sec:preprocessing}, presentation of the actual tool annotators use in \ref{sec:manualannotation}, and the compilation of their submitted results in \ref{sec:goldstandard}, we will be ready to feed the KrdWrd Gold Standard to a ML Engine.
+%An exemplification, the KrdWrd ML Engine, is covered in \ref{cha:krdwrdsys2}.
+\end{longversion}
diff --git a/310.tex b/310.tex
new file mode 100644
index 0000000..c396e59
--- /dev/null
+++ b/310.tex
@@ -0,0 +1,50 @@
+\begin{longversion}
+%
+%
+Two fundamental ideas behind this part of the system are:
+firstly, Web pages have a textual representation, namely the text they contain, a structural representation, namely their DOM tree, and a visual representation, namely their rendered view -- all representations should be considered when automatically cleaning Web pages, and consequently, all should be annotated during acquisition of training data for ML tasks.
+Secondly, data acquisition for training of supervised ML algorithms should preserve pristine, unmodified versions of Web pages -- this will help to reproduce results \emph{and} to compare those of different architectures.
+
+% What
+\subsection{Functional Walk-Through}
+
+%What, Who, How, Result
+Gathering a set of sample pages is at the beginning of tagging new data. 
+The process needs to be coordinated by the administrators of the system, i.e.~server level access is needed to make new corpora available for later tagging by users. 
+The Process starts with a list of seed terms which are used to construct an ad-hoc corpus of Web pages where the result is a list of Uniform Resource Locators (URL\footnote{see \cite{URL} for details -- but also \cite{w3.org/Addressing}.}).  
+ 
+%What, Who, How, Result
+The URL list is then \emph{harvested}, i.e.~the according Web pages are downloaded and saved for further processing. 
+This process is coordinated by the administrators of the system and is started as automated batch-job on the server where its input is the URL List and the result is the set of downloaded Web pages and their content.
+
+%What, Who, How, Result
+These Web Pages are then available online to users for tagging, i.e.~there are no constraints on who is able to access these pages; 
+however, keeping track of \emph{who tagged what} requires to differentiate between users, and hence, registration with the system, viz.~logging in. 
+The Web pages are accessible via the KrdWrd Add-on in combination\footnotemark~with the Web Services hosted on \cite[Web Site]{krdwrd.org}.
+\footnotetext{Indeed, the data is accessible with \emph{any} browser -- but the KrdWrd Add-on enhances the experience.}
+
+%What, Who, How, Result
+Users can tag new, alter or redisplay formerly tagged Web pages with the help of the KrdWrd Add-on.
+The KrdWrd Add-on builds upon and extends the functionality of the Firefox \cite{firefox} browser and facilitates the visual tagging of Web pages, i.e.~users are provided with an accurate Web page presentation and annotation utility in a typical browsing environment.
+Readily (or partly) tagged pages are directly sent back to the server for storage in the KrdWrd Corpus data pool and for further processing.  
+
+%What, Who, How, Result
+Updated or newly submitted tagging results are regularly merged, i.e.~submitted results from different users for the same content are processed and compiled into a majority-driven uniform view.
+This automated process uses a \emph{winner takes all strategy} and runs regularly on the server -- without further ado.
+The \emph{merged} content is stored in the KrdWrd data pool and hence, available for browsing, viewing, and analysis by the KrdWrd Add-on\footnotemark[\value{footnote}] and furthermore, it can be used as training data for Machine Learning algorithms. 
+
+
+% What
+\subsection{Implementation Survey}
+
+The KrdWrd Infrastructure consists of several components that bring along the overall functionality of the system. 
+They are run either on the KrdWrd Server or are part of the KrdWrd Add-on and hence, build upon and extend the functionality of the Firefox browser.
+The Server components are hosted on a Debian GNU/Linux \cite{debian.org} powered machine.
+However, the requirements\footnote{These include sed, awk, python, bash, subversion, XULRunner, wwwoffle, apache, R.} are rather limited and many other standard linux - or linux-like - systems should easily suffice, and even other platforms should be able to host the system. 
+Nevertheless, the KrdWrd Add-on strictly runs only as an extension of the Firefox browser, version 3\footnote{But it could be converted into a self-contained XULRunner application.}.
+
+Access to the system is given as HTTP Service hosted on \url{krdwrd.org}, an SSL-certified virtual host running on an Apache Web Server \cite{httpd.apache.org} accompanied by mailing services, a dedicated trac as Wiki and issue tracking system for software development (extended with a mailing extension), and subversion \cite{subversion} as version control system.
+The interfacing between the KrdWrd Add-on and the Web Server is done via CGI \cite{cgi} scripts, which itself are mostly written in the Python programming language \cite{python}.
+%
+%
+\end{longversion}
diff --git a/320.tex b/320.tex
new file mode 100644
index 0000000..3020de7
--- /dev/null
+++ b/320.tex
@@ -0,0 +1,157 @@
+\begin{longversion}
+%
+%
+Generally, pre-processing is the first step to streamline external data for further processing in a customised data-processing pipeline, and in the KrdWrd System it constitutes harvesting data, i.e.~grab Web pages off the web, convert them into UTF-8 encoding \cite{unicode.org}, make links on these pages relative \cite{w3.org/base}, and compile them into a corpus that can be tagged by users.
+
+
+% What, (Why,) How, Result
+\subsection{URL List Generation}
+
+For downloading pages off the Web the KrdWrd System needs to be told \emph{which} pages to grab. But because we are interested in a wide spectrum of layouts we need to scatter the URLs to be fetched over different web sites, i.e.~we are not interested in having a small number of site URLs and then recursively grab these sites but we want a large number of URLs from different sites.
+
+To this end we utilise the BootCaT toolkit\cite{BaroniSilvia2004} to construct an ad-hoc URL list:
+a set of seed terms is used for automated queries against a Web search engine, the top results for querying random combinations of the terms are downloaded and analysed, i.e.~unigram term counts from all retrieved Web pages are compared with the corresponding counts from a reference corpus. 
+In the last step\footnote{Though, this loop can be repeated multiple times with unigram term counts until the corpus of retrieved Web pages reaches a certain size or matches other characteristics.} multi-word terms are extracted and used a seed terms for the query process.
+However, we used the top results from these last multi-word queries as URL List.
+
+
+\paragraph{en d\'{e}tail:}
+We used the BootCaT installation of the Institute of Cognitive Science's Computational Linguistics group at the University of Osnabr\"{u}ck\cite{ikw/CL} -- at the time of writing this was the initial version with the updates from February 2007.
+
+The topic of the seed terms for the BootCaT procedure was ``Nuremberg in the Middle Ages'', the terms were:
+\emph{
+        history,
+        coffee,
+        salt,
+        spices,
+        trade road,
+        toll,
+        metal,
+        silk,
+        patrician,
+        pirate,
+        goods,
+        merchant}, 
+the Internet search engine BootCaT used was Yahoo!\cite{yahoo}, 
+the reference corpus was the British National Corpus (BNC)\footnote{The data was obtained under the terms of the BNC End User Licence. For information and licensing conditions relating to the BNC, please see the web site at \url{http://www.natcorp.ox.ac.uk}}, 
+and the procedure resulted in 658 URLs from unique domains; 
+note that we departed from the original BootCaT recipe and only allowed one URL per domain.  
+This URL list was passed on to the KrdWrd Harvester -- but, of course, \emph{any} URL list can be feed to the Harvester.
+
+%le fin:
+\noindent \linebreak
+The seed terms, the command sequence, and the URL list can be found at \url{https://krdwrd.org/trac/browser/tags/harvest/canola}.
+
+
+% What, (Why,) How, Result
+\subsection{The KrdWrd App: Harvesting Mode}
+
+The automated downloading of Web content is done by the KrdWrd App in harvesting mode, namely by feeding the App an URL list as input and have it then fetch and store downloaded content for further processing. Moreover this process resolves three significant concerns:
+\begin{description}
+\item[Enforce UTF-8 Character Encoding] for grabbed documents -- character encoding has been the cause for much hassle in data processing, and to eliminate it -- or at least reduce it to a minimum -- we transform \emph{every} document into UTF-8 encoding \cite{unicode.org} and make sure that successive processing steps are UTF-8 aware.
+\item[Change the <BASE> Element] for grabbed documents (or insert one) \cite{w3.org/base, w3.org/Addressing} -- for smooth integration into the KrdWrd system  we change this attribute such that relative URIs are resolved relative to our system. 
+\item[Surround Text with <KW> Elements] in grabbed documents -- these additional elements splits up text: when large amounts of text fall under a single node in the DOM tree, i.e.~when the whole text can only be selected as a whole, these elements loosen this restriction but, on the other hand, do not affect the rendering of the Web page or other processing steps.     
+\end{description}
+
+Finally, the System extracts the textual content of each page and only considers documents of certain text length as appropriate for further processing and discards all others. The rational is that \emph{very} short and \emph{very} long web pages rarely contain useful samples of interesting running text.
+
+\paragraph{en d\'{e}tail:}
+We used the previously generated URL list and fed it to the KrdWrd App in harvesting mode, which then retrieved Web pages via the KrdWrd Proxy (see \ref{sec:proxy}) just as if someone operating a Firefox browser had viewed them. 
+The textual length restriction was set to only allow for a \emph{decent} amount of text, which we thought holds for documents consisting of 500 to 6,000 words\footnote{We used the linux \texttt{wc}\cite{wc} command, i.e.~a word is a string of characters delimited by white space characters.}. 
+Finally, we manually inspected the remaining grabbed pages for problems arising from limitations -- and had to discard two files. 
+Overall, the process resulted in 228 pages that were considered for further processing. 
+
+%le fin:
+\noindent \linebreak
+The currently used 'harvester' can be found at \url{https://krdwrd.org/trac/browser/trunk/src/app/harvest.sh}.
+
+
+% What, (Why,) How, Result
+\subsection{\label{sec:proxy}The KrdWrd Proxy} 
+
+The KrdWrd Harvester and the KrdWrd Add-on make all Internet connections through the KrdWrd Proxy.
+This storage fills up with the harvested Web pages but also with all directly-linked material, which is included via absolute or relative links, or e.g.~\emph{generated} by scripts.
+Often, this `additional' material is considered superfluous and therefore discarded; 
+moreover, the non-textual content of Web pages is often stripped off -- or the textual or structural content altered. 
+See e.g.~\cite{PotaModule,cleaneval/annotation_guidelines}, or more generally \cite{WAC2,WAC3,WAC4}. 
+
+Unfortunately, this renders it very difficult -- or even impossible -- to compare work in cases where one utilises data that is not available any more or only in an altered form.
+This is to say indeed, in the end we \emph{also} want text but with different requirements of competing systems the base material must be pristine, i.e.~the most `natural' and the least modified version of data should be conserved. 
+To this end, we utilise the World Wide Web Offline Explorer (wwwoffle)\cite{wwwoffle} as a proxy, which can be operated in two modes: \emph{online} and \emph{offline}. 
+
+\paragraph{wwwoffle Online Mode} allows for 
+caching of pages that are downloaded for later review, 
+use with one or more external proxies, 
+control over which pages cannot be accessed and 
+which pages are not to be stored in the cache.
+
+\paragraph{wwwoffle Offline Mode} allows for
+use of normal browser to follow links,
+control which pages can be requested, and
+for non-cached access to Intranet servers. 
+
+\paragraph{wwwoffle generally} allows for a
+searchable cache index with the addition of included programs, and
+viewable indexes sorted by name, date, server domain name, type of file.
+The configuration is done in a single configuration file, which can be accessed via an interactive web page to allow editing; user customisable error and information pages are also easily configurable. 
+
+\noindent\linebreak
+During pre-processing the KrdWrd Online Proxy is used; it runs as a daemon and responds only to internal requests but material that is downloaded in online mode will be available for requests in offline mode.
+
+The KrdWrd Offline Proxy runs as a daemon and responds to network requests from the Internet, it is publicly available\footnote{with the exception that there exists a \emph{dummy} login\ldots}, and can be accessed via \url{proxy.krdwrd.org:8080}. 
+This proxy does not fetch new pages into the KrdWrd Cache, i.e.~all Web page request coming from the client computer, e.g.~from a user surfing the net with an installed and enabled KrdWrd Add-on, will be filtered and only requests for content that had previously been downloaded in online mode will be allowed. 
+The offline mode is automatically configured by the KrdWrd Add-on .
+
+The Proxy data pool holds unmodified, re-loadable (near\footnote{Dynamically generated links are challenging and may lead to missing content.}) copies of all the Web pages from within the KrdWrd Corpus. 
+
+\paragraph{en d\'{e}tail:}
+We set-up and configured two instances of wwwoffle on the KrdWrd Host;
+one publicly available, operating in offline mode and constituting the KrdWrd Offline Proxy, and one for use by the KrdWrd Harvester, operating in online mode and constituting the KrdWrd Online Proxy. 
+The two instances are operational at the same time and they share the same data pool; this is easily possible and does not result in data inconsistencies because the offline proxy only reads data from the pool -- it never writes data to the pool. Additionally, we configured the online proxy to \emph{never} re-grab material, i.e.~the first encounter of new content will be the one the systems keeps.
+
+%le fin:
+\noindent \linebreak
+The currently used configuration can be found at \url{https://krdwrd.org/trac/browser/trunk/src/utils/wwwoffle}.
+
+%At first glace the krdwrd project and projects archiving Web content have a lot in common with search engines: they all are in need of a high performance web-crawler, i.e.~a program for autonomously downloading vast amounts of web pages;
+%However, whereas crawlers for archive and search engine projects given a set s of seed \emph{Uniform Resource Locators} (URLs), repeatedly remove URLs from the set, downloads the corresponding page, extract all the URLs contained in it, and adds any previously unknown URLs to the set, a crawler in the Web as Corpus (WaC) context only gets an inital set of URLs and downloads the corresponding pages of this set.
+
+
+%    >>> The Archive 
+%    There are many different kinds of dynamic pages, some of which are easily stored in an archive and some of which fall apart completely. When a dynamic page renders standard html, the archive works beautifully. When a dynamic page contains forms, JavaScript, or other elements that require interaction with the originating host, the archive will not contain the original site's functionality.
+
+%    Why are some sites harder to archive than others?
+
+%    If you look at our collection of archived sites, you will find some broken pages, missing graphics, and some sites that aren't archived at all. Here are some things that make it difficult to archive a web site:
+
+%        * Robots.txt -- We respect robot exclusion headers.
+%        * Javascript -- Javascript elements are often hard to archive, but especially if they generate links without having the full name in the page. Plus, if javascript needs to contact the originating server in order to work, it will fail when archived.
+%        * Server side image maps -- Like any functionality on the web, if it needs to contact the originating server in order to work, it will fail when archived.
+%        * Unknown sites -- The archive contains crawls of the Web completed by Alexa Internet. If Alexa doesn't know about your site, it won't be archived. Use the Alexa Toolbar (available at www.alexa.com), and it will know about your page. Or you can visit Alexa's Archive Your Site page at \url{http://pages.alexa.com/help/webmasters/index.html#crawl_site}.
+%        * Orphan pages -- If there are no links to your pages, the robot won't find it (the robots don't enter queries in search boxes.)
+
+%    As a general rule of thumb, simple html is the easiest to archive.
+%    <<<
+
+%    there are three frameworks worth mentioning -- we explixitly only talk about building archives of Web content and not, e.g.~about building a cache for performace reasons, or about caching content for traffic analyses or debuigging pruposes.
+
+%    archive org:
+%    http://webteam.archive.org/confluence/display/Heritrix/Home
+%    why not use a well-developed tool to build up a local cache? (well, because - for the time being - wwwoffle does the job and was easier to set-up. but using archive.org's stuff should be considered...)
+%
+%    http://www.httrack.com/ and (in particular) http://www.httrack.com/proxytrack/
+%    interesting approach, available on many platforms, integrated klicki-bunti thingything, etc. ...and why not use this one? hm, cf. further up... 
+
+
+%    A very interesting aproach in this respect is the Internet Archive at [http://www.archive.org] whose purpose is to build a digital library of Internet sites -- this includes archiving the same site at different points in time. 
+%    The Archive develops software tailored to their needs, e.g.~the Heritrix archival crawler [\url{http://crawler.archive.org/}] "is the Internet Archive's open-source, extensible, web-scale, archival-quality web crawler project."  
+%    , which could suit the needs for the Web as Corpus (WaC) community. However, our inital evaluation of was that the main goal of The Archive is 
+
+%    "Our breadth-first crawl together with heuristics for normalizing URLs prevents us from going too deeply into uninteresting data." [ Report 174 Towards Web-scale Web Archaeology
+%    by Shun-tak A. Leung, Shun-tak A. Leung, Sharon E. Perl, Sharon E. Perl, Raymie Stata, Raymie Stata, Janet L. Wiener, Janet L. Wiener ]
+%
+%    [Known HTTP Proxy/Caching Problems (http://tools.ietf.org/html/rfc3143)]
+%
+%
+\end{longversion}
+
diff --git a/330.tex b/330.tex
new file mode 100644
index 0000000..494a481
--- /dev/null
+++ b/330.tex
@@ -0,0 +1,117 @@
+\begin{longversion}
+%
+%
+The pre-processed data is now ready to be processed by annotators, and we will present the setting in which the annotated data, the foundation for the gold standard, was acquired.
+
+The KrdWrd System incorporates the KrdWrd Add-on, an extension for the Firefox browser, which facilitates the visual tagging of Web pages. 
+However, users also need to be told \emph{what} to tag \emph{how} -- therefore, a refined version of the official `CLEANEVAL: Guidelines for annotators' \cite{cleaneval/annotation_guidelines} is provided, and -- additionally -- users are encouraged to work through a small tutorial to get acquainted with different aspects of how to apply the guidelines to real-world Web pages.
+The snag of finding people to actually put the system into use was kindly solved by the lecturers of the \emph{Introduction to Computational Linguistics} class of 2008 from the Cognitive Science Program at the University of Osnabr\"{u}ck by means of an homework assignment for students.
+
+
+% What, (Why,) How, Result
+\subsection{\label{sec:addon}The KrdWrd Add-on: An Annotation Platform}
+The KrdWrd Add-on receives data from the server, modifies the rendering of Web pages by highlighting selected text, supports the tagging of different parts of a page differently, and finally, sends an annotated page back to the server for storage and subsequent processing.
+
+It extends the functionality of the Firefox browser with a status-bar menu where -- beside some administrative tasks -- the user may choose to put the current browser tab into \textit{tracking mode}.
+In this mode pre-defined colour coded tags are integrated into the familiar view of a Web page
+A)~to highlight the part of the page where the mouse is hovering over and thereby, is subject to tagging, and
+B)~to highlight the already tagged parts of the page.
+
+The annotation process is straightforward (cf.~figure \ref{fig:addon} for a partly annotated page): 
+\begin{enumerate}
+\item Users move the mouse over the Web page and the block of text \emph{under} the mouse pointer is highlighted
+(Sometimes this block will be rather small, sometimes it may cover large portions of text),
+\item Users assign tags to the highlighted blocks by either using assigned keyboard shortcuts or via entries in the context menu (Afterwards, these blocks stay coloured in the respective colours of the assigned tags),
+\item Users submit the page, i.e.~the Web page \emph{and} the incorporated tags are transfered to the server -- this is done by pressing a shortcut or via an entry in the status-bar menu  
+(The tagged page, or a partly tagged page, for that matter, can be re-submitted to the server), and
+\item The KrdWrd System serves a new, untagged page for tagging\footnote{This new page is randomly selected among the set of pages with the lowest count of aggregated submissions per user, i.e.~at large, the submissions will be evenly distributed over the corpus -- but cf.~\ref{fig:submsperpage}}.
+\end{enumerate}
+
+\fig{.80}{addon2}{We used the lovely colour fuchsia to highlight the part of the page where the mouse is hovering over, and the colours red, yellow, and green\footnotemark~for the already tagged parts, where red corresponded to \emph{Bad}, yellow to \emph{Unknown}, and green to \emph{Good} (cf.~\ref{sec:guidelines} for details).}{fig:addon} 
+\footnotetext{\textcolor{fuchsia}{fuchsia} -- there is a short story behind this color: \url{https://krdwrd.org/trac/wiki/KrdWrd} -- \textcolor{red}{red}, \textcolor{yellow}{yellow}, and \textcolor{green}{green}, respectively}
+Furthermore, the KrdWrd Add-on is accompanied by a manual \cite{krdwrd.org/manual}.
+It explains how to install the Add-on, get started with tagging pages, how to actually tag them, i.e.~it includes the annotation guidelines, and also gives some tips \& tricks on common tasks and problems.
+
+%le fin:
+\noindent \linebreak
+The Add-on is available from \url{https://krdwrd.org/trac/wiki/AddOn}.
+
+
+% What, (Why,) How, Result
+\subsection{\label{sec:guidelines}The KrdWrd Annotation Guidelines}
+The KrdWrd Annotation Guidelines specify which tag should be assigned to particular kinds of text. 
+We used the CleanEval (CE) annotation guidelines as a start (cf. \cite{cleaneval/annotation_guidelines}), and made a few substantial changes however, because we realised that there were several cases in which their guidelines were insufficient.
+
+The most important change we made was the addition of a third tag `uncertain' whereas originally, only the two tags `good' and `bad' were available.
+It had soon become apparent that on some Web pages there were passages that we did not want to be part of a corpus (i.e.~that we did not want to tag `good') but that we did not want to throw out altogether either (i.e.~tag them as `bad'). 
+We also decided to tag all captions as `uncertain'. 
+
+Another rationale behind this introduction of this third tag was that we might want to process this data at a later stage.
+Also note that in \cite{SpoustaMarekPecina2008} other CE participants also used a three-element tag set.
+
+We adopted the following guidelines from the CE contest, and all of these items were supposed to be tagged `bad':
+
+\begin{itemize}
+        \item Navigation information
+        \item Copyright notices and other legal information
+        \item Standard header, footer, and template material that are repeated across (a subset of) the pages of the same site
+\end{itemize}
+
+We modified the requirement to clean Web pages of internal and external link lists and of advertisement slightly: 
+The KrdWrd Guidelines state that \textit{all} \mbox{(hyper-)links} that are \textit{not} part of the text are supposed to be tagged as `bad'. 
+This, of course, includes link lists of various kinds, but preserves links that are grammatically embedded in `good' text. 
+We also restricted ourselves as to discard advertisement from \textit{external} sites only. 
+Some of the pages were pages about certain products, i.e.~advertisement, but we did not want to exclude these texts (if they fulfilled our requirements for `good' text, as defined below).  
+
+The two sorts of text, we did not exclude specifically (as the CE guidelines did), were Web-spam, such as automated postings by spammer or blogger, and cited passages. 
+Instead, we required `good' text to consist of complete and grammatical English sentences that did not contain `non-words' such as file names. 
+That way, we filter out automatically generated text \textit{only if} it is not grammatical or does not make up complete sentences, and keep text that can be useful for information extraction with statistical models.
+
+Our refined annotation guidelines still leave some small room for uncertainties (but probably \textit{all} such guidelines suffer from this problem). 
+We are optimistic, however, that they are a clear improvement over the original CE guidelines and that our Web corpus will only contain complete and grammatical English sentences that contain `normal' words only.
+
+%le fin:
+\noindent \linebreak
+The annotation guidelines are available from \url{https://krdwrd.org/manual/html/}.
+
+
+\subsection{\label{sec:tutorial}The KrdWrd Tutorial: Training for the Annotators}
+For initial practice, we developed an interactive tutorial that can be completed online (as feature of an installed Add-on).
+
+The interactive tutorial can be accessed from the status bar by clicking `Start Tutorial', and is designed to practice the annotation process itself and to learn how to use the three different tags correctly. 
+Eleven sample pages are displayed one after another, ranging from easy to difficult (these are the same samples as in the 'How to Tag Pages' section of the manual).
+
+The user is asked to tag the displayed pages according to the guidelines presented in the manual. 
+We inserted a validation step between the clicking of `Submit' and the presentation of the next page, giving the user feedback on whether or not she used the tags correctly. 
+Passages that are tagged in accordance with our annotations are displayed in a light-coloured version of the original tag, i.e.~text correctly tagged as `bad' will be light-red, `good' text will be light-green, and text that was tagged correctly as `uncertain' will be light-yellow.
+The passages with \textit{differing} annotations are displayed in the colour in which they should have been tagged, using the normal colours, i.e.~saturated red, green, and yellow.
+After clicking `Next Page' on the right top of the screen, the next page will be shown.
+
+If a user should decide to quit the interactive tutorial before having tagged all eleven sample pages, the next time she opens the tutorial, it will begin with the first of the pages that have not been tagged, yet. 
+And should a user want to start the tutorial from the beginning, she can delete previous annotations via `My Stats' in the status bar. 
+Then, the next time the tutorial is opened it will start from the very beginning. 
+By pressing `Start Tutorial' in the status bar during the practice and \textit{before} the submission of the current page, that same page will be displayed again, un-annotated.
+When using `Start Tutorial' \textit{after} a page's submission and before clicking `Next Page' in the notification box at the top, the next page of the tutorial will be shown.
+
+As stated above, it is our goal that the interactive tutorial will help users getting used to the annotation process, and we are also optimistic that it helps understanding and correctly applying the tagging guidelines as presented in the manual.
+
+%%le fin:
+%\noindent \linebreak
+%The tutorial is only available as part of an installed Add-on. 
+
+
+\subsection{\label{sec:assignment}The KrdWrd Assignment: A Competitive Shared Annotation Task}
+Finally, our efforts were incorporated into an assignment for the class `Introduction to Computational Linguistics' where -- from a maximum number of 100 students -- 68 completed the assignment, i.e.~their effort was worth at least 50\% of the assignment's total regular credits.
+The assignment was handed out 7, July, was due 18, July 2008, and consisted of two exercises:
+\begin{enumerate}
+        \item The first task was to complete the interactive online tutorial, i.e.~the students had to go through the eleven sample pages, annotate them, and -- ideally -- think about the feedback. This task was worth 20\% of the credits.
+        \item The second task was to tag pages from our assembled corpus; 15 tagged pages were worth 80\% of the credits and 10 additional pages were worth an extra that was counted towards the credits of all other homework assignments, i.e.~students could make up for `lost' credits\footnote{As a matter of fact, 43 students received the total of 100\% regular credits + 100\% extra credits.}.
+\end{enumerate}
+
+%le fin:
+%\noindent \linebreak
+%The assignment is enclosed in the appendix (cf.~\ref{cha:appendix}). 
+%
+%
+\end{longversion}
+
diff --git a/340.tex b/340.tex
new file mode 100644
index 0000000..1b14cba
--- /dev/null
+++ b/340.tex
@@ -0,0 +1,146 @@
+\begin{longversion}
+%
+%
+The data for the gold standard was collected via the KrdWrd Add-on (cf.~\ref{sec:addon})
+as an homework assignment (cf.~\ref{sec:assignment}) for a Computational Linguistics class,
+which is a second year undergraduate Cognitive Science class at the University of Osnabr\"{u}ck.
+The Annotators were introduced to the KrdWrd Annotation Guidelines (cf.~\ref{sec:guidelines}) by means of the KrdWrd Tutorial (cf.~\ref{sec:tutorial}), and 
+were supposed to work independently (e.g.~from their home PC) though, could have sat near each other. 
+However, we did take precautions against na\"{i}ve copying by enforcing authentication for the users with their student accounts,
+hiding other users' results, and
+serving random pages for tagging -- thus, even if students were exchanging information it could rather have been about the assignment and the tagging in general than a specific Web site in particular.
+
+\subsection{Initial Observations}
+From 100 student subscribed to the class 69 installed the Add-on and submitted at least one page (not necessarily a tagged one, though\ldots).
+This was also about the ratio of students who took the final exam for this course hence, we can say that almost every students seriously interested in finishing this class also took the homework assignment.
+
+The majority of submissions came within the last 4 days of the period of time they were granted to finish the assignment -- with a major peak at the last day;
+which, according to all we know, is quite common.
+This has probably also led to only very few people making use of the re-submit feature, i.e.~continuing or modifying an already submitted page.
+
+The possibility to interact with the KrdWrd Team, e.g.~to solve installation problems, or to exchange information via an e-Mail list we had set up for this purpose was rarely used (cf.~\url{https://krdwrd.org/trac/mail/threads}).
+The few reported problems however, led to some beneficial improvements of the documentation.
+
+\paragraph{Our initial Data Set} (before further clean-up):
+228 Web pages, consisting of almost 440,000 words and over 2.6 million characters, were independently processed by
+69 users who submitted 
+1767 results (re-submits for a page counted only once), which is an average of 7.75 submissions per page. 
+
+
+% What, (Why,) How, Result
+\subsection{The KrdWrd App: Annotations Merging Mode}
+
+The KrdWrd App in merging mode compares the initially grabed \emph{master} with the user submitted results and, for every text node in the DOM tree, computes a majority vote and assigns this as the gold standard tag to the node of a newly created document. 
+
+The process is carried out offline on the server:
+the input is one URL of a master document and the URLs of the respective user submitted results.
+After reading all documents the DOM trees of all documents are traversed top-down and tags along the traversal are propagated further down as long as, 
+no more specific tag is encountered,
+i.e.~a tag can not overwrite another one further down the path but is \emph{pushed down} as far as possible (cf.~figure \ref{fig:propagation} for an illustration).
+At the end of each path in the tree the assigned tags are counted.
+After having traversed all documents a sanity check is carried out\footnote{
+We also implemented another sanity check namely, to check whether the textual content in the nodes is identical but dropped this condition -- mainly because the few encounters were false positives and it had negative impact on performance as well.
+} \footnote{
+The overall handling of JavaScript is not satisfactory.
+To address the diversions between submits occurring after dynamic client-side JavaScript execution on different clients, the Add-on could hook into the node creation and clone processes.
+They could be suppressed entirely or newly created nodes could grow a special id tag to help identifying them later.
+} namely, are there documents which still have unseen nodes, or are there documents which had less nodes than the master document? 
+In either case, these submissions are discarded from further processing.
+
+The remaining submissions are taken into account for the majority vote on each node of the master document.
+Another document is generated, which includes the resulting tags.
+
+\begin{figure}[htb]
+    \centering
+    \begin{minipage}[c]{.9\textwidth}
+        \includegraphics[width=.48\textwidth,keepaspectratio]{preprop}
+        \hfill
+        \includegraphics[width=.48\textwidth,keepaspectratio]{postprop}
+    \end{minipage}
+    \caption{On the left is the un-propagated page with major parts having been tagged green and red. On the right is the propagated version where the green has been pushed down into the text nodes; the same holds for red but note that the heading in yellow has not been overwritten.}
+    \label{fig:propagation}
+\end{figure}
+
+
+% What, (Why,) How, Result
+\subsection{Merge Analysis}
+
+Before we started to analyse the results of the merging process we excluded the results of one user who had only submitted one page. 
+Then, the merging process revealed the following problematic cases (usually by rejecting user results on grounds of the sanity check): 
+%2: 690, 870
+2 pages with no results left to merge,
+%3: 708, 715, 908 (708:bad, rest ok)
+3 pages with only one result to merge,
+%2: 714, 720 (714: poor, 720: bad)
+2 more pages with only two result to merge,
+%1: 746
+1 page with four results to merge, and 
+%1:?
+1 page that could not be merged due to an error in our application\footnote{We fixed the error but this rendered the submitted pages unusable -- newly submitted pages will be mergable.}.
+We also excluded all these cases from further processing (cf.~figure \ref{fig:submsperpage}).
+
+\fig{.9}{submsperpage_hist}{Number of Pages with x Submissions - the dividing line at \emph{5 Submissions} shows the cut-off, i.e.~pages with less then 5 Submissions were excluded from further processing. The observant reader may notice that we said the annotations were evenly distributed: this is the case, now. We had not turned on this feature when we started collecting the data, however.}{fig:submsperpage}
+
+We continued to do a plausibility check for the submitted results:
+we computed a \emph{tag-bias} for each user, where we compared each user's tendency to chose a tag for a node with the actual winning tags for nodes. 
+This computation revealed 
+% 4: 14, 34, 31, 74
+4 cases in which users showed strong biases towards certain tags\footnote{Manual inspection of these cases showed that the users obviously only wanted to raise their tagged-pages count and therefore, just tagged very few nodes -- typically high up in the DOM tree -- which were then propagated downwards.}.
+We also excluded all results from these users.
+
+% BADUIDS=(14 31 34 73 74)
+\paragraph{The resulting and final Data Set:}
+219 Web pages, consisting of more than 420,000 words and over 2.5 million characters, were independently processed by
+64 users who submitted 
+1595 results (re-submits for a page counted only once), which is an average of 7.28 submissions per page. 
+
+\noindent \newline
+We continued our analyses of this new data at hand, and looked into the timestamps we collected for the submissions:
+therefore, we summed up all the deltas between two submissions for each user and calculated the duration each user \emph{saw} a single page;
+then, we computed a reasonable upper bound for how log a submit action might take,
+i.e.~the hypothesis was that page-view times longer than a certain amount of time were actually breaks.
+To this end, we detected outliers\footnote{This is quite standard: $x$ values outside the range $Q1 - 1.5*IQR < x < 1.5*IQR + Q3$ were considered outliers.} and discarded \emph{all} respective submissions (the calculated\cite{r-project} result was 700s).
+
+The calculated time data suggests that:
+\begin{itemize}
+\item the majority of users spent between 56 and 88 minutes on the assignment with an average of 71 minutes (cf.~figure \ref{fig:timespentonassignment} for details), 
+\item average per-page annotation time drops below three minutes (cf.~figure \ref{fig:timespentonpage}), and
+\item the first pages after the tutorial are still more challenging than later ones (cf.~\ref{fig:sequencedelta}).
+\end{itemize}
+
+\fig{.9}{timespentonassignment}{Time in Minutes spent by y Users on the Assignment, i.e.~how much Time did a User interact with the Add-on to tag her share of the Canola Corpus.}{fig:timespentonassignment}
+% Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+% 11.92   56.03   69.52   70.64   88.10  140.70
+
+\fig{.9}{timespentonpage}{Minutes spent on a single Page accross all annotations of the Canola corpus.}{fig:timespentonpage}
+% Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+% 28.6   121.9   172.2   168.5   200.0   324.2 
+
+\fig{.55}{sequencedelta}{Smoothened average of differences in seconds between annotation times of all users at Position x in their specific sequences of Web Pages and the mean of all other users who processed identical pages at a later time in their respective sequences.}{fig:sequencedelta}
+
+
+%length:
+% - num word/nodes does not correlate to time for page - only for quite short documents
+% - long time spent on documents: long /rendered/ documents - but also one of the quickest and best tagged documents is among them\ldots
+%
+% 716(took the longest) 697 696 815 774 ||  888 765 769 910 767(shortes)
+% https://krdwrd.org/pages/bin/view/[NUMBER]
+
+%\subsection{Agreement and Disagreement Analysis of Annotations}
+
+\fig{.9}{pagesperuser}{Aggregated counts for the Number of Users who processed \emph{at least} x Number of Pages. Note the two steps at 15 and 25 pages, which correspond to the obligatory and the optional number of pages in the assignment. Also note that there were quite many students who went far beyond the requirement for the assignment.}{fig:pagesperuse}
+
+For the overall inter-coder agreement of the remaining submissions we calculated Fleiss's multi-$\pi$ as layed out in \cite{ArtsteinPoesio2008}: for each Web page the remaining submissions were set as coders, and the tagged DOM nodes as items -- the three categories were fixed.
+This resulted in an average inter-coder agreement over all pages of 0.85 (cf.~\ref{fig:agreementonpages}), which we think is -- at least -- substantial.
+Considering that these submissions were the basis for the merge process we believe that the Canola Gold Standard Corpus is a solid basis for further processing.
+Furthermore, this metric could be used for comparison of cleaning results in general -- maybe normalised for the number of words or characters per DOM node.
+
+\fig{.9}{agreementonpages}{Inter-coder agreement between submissions for pages over the Canola corpus.}{fig:agreementonpages}
+% Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+% 0.4166  0.7508  0.8585  0.8260  0.9246  1.0000
+
+\paragraph{Remark:} we looked into the pages at the lower end of the agreement spectrum and found that they tended to be quite long and were often discussion forum pages, i.e.~with many alterations in the tags that were to assign. Given that similar shorter pages achieved better results, it seems that even our already quite low boundary of 6,000 words per page resulted in pages that were frustrating to process.
+%
+%
+\end{longversion}
+
diff --git a/350.tex b/350.tex
new file mode 100644
index 0000000..ee967a7
--- /dev/null
+++ b/350.tex
@@ -0,0 +1,130 @@
+\begin{longversion}
+%
+%
+- frameworks 4 tagging
+- gold: welche seiten entfernt
+
+\paragraph{What was done}
+\begin{itemize}
+\item A Firefox add-on: receive data from the server and send it back  
+\item A server: provide pages to clients and receive tagged pages (from the individual users)
+\item Refined annotation guidelines: incorporate feed-back from CleanEval-1 taggers
+\item Manual for the add-on: provide a means of getting to know the tool
+\item Interactive online tutorial: provide a means to get hands-on experience using the add-on while applying the tagging guidelines
+\item An I2CL assignment: gather Gold Standard annotations for Web pages  
+\end{itemize}
+
+\paragraph{What was used}
+\begin{itemize}
+\item A shared Debian/GNU Linux Server with a shared Apache Web server
+\item A dedicated Trac, i.e.~an enhanced wiki and issue tracking system for software development projects
+\item A dedicated svn, i.e.~an open-source revision control system for all documentation and all programm code
+\item A WWWOffle proxy server to keep a (quite) pure version of the pages to be tagged
+\item A XULRunner application to harvest the pages \textit{through} the proxy -- as if they were viewed by a user
+\item JavaScript, Python, Perl, Bash-scripting for the necessary front- and back-ends
+\item A SQLite3 self-contained, embeddable, zero-configuration SQL database engine for the (less) pure version of the pages to be tagged and the users' annotations
+\item \ldots
+\end{itemize}
+
+
+
+\paragraph{Bonus: FIASCO Corpus}
+    The FIASCO corpus was compiled from the original ``Osnabr\"{u}ck Cleaneval Gold Standard'' [Bauer et al. Sec.4], by combining the re-alignment method from then with the regular \nobreak{KrdWrd} system pipeline. The re-alignment was necessary because [Bauer et al.]
+    \begin{quote}
+    [\ldots] made the mistake of using the same annotation strategy as the official
+    CLEANEVAL data set. This meant that all internal structural information of the HTML
+    document was lost in the manual cleanup process (since the cleanup started from text
+    dumps of the pages) and the gold standard was therefore not directly usable as training
+    data for [their] machine-learning algorithm.
+    \end{quote}
+
+    The original 158 wget\cite{wget} dumped files\footnote{available from \url{https://krdwrd.org/pages/dat/fiasco_en/}} were copied to an \emph{external} host and their locations were used as URL List, i.e.~the files and all the embedded or linked content was fetched through the online-proxy -- and was then available via the off-line proxy.
+    However, instead of adding the corpus to the KrdWrd system the files were processed along the original re-alignment pipeline.
+    The HTML output of the KrdWrd system was converted to valid XHTML using the open source utility TagSoup\url{http://ccil.org/~cowan/XML/tagsoup/} -- a SAX parser written in Java and designed to process even extremely malformed HTML sources.
+    In addition, some rule based cleanup was done, viz.~inline Javascript (marked with the \texttt{<script>} tag), style information and HTML comments were removed.
+    Finally, paragraphs not contained in proper \texttt{<p>} elements but delimited by line breaks or marked with empty \texttt{<br/>} tags were replaced with proper paragraph elements.
+    Needless to say that these steps -- other than following the original pipeline -- are not necessary for the KrdWrd system.
+
+    The resulting XHTML files were automatically\footnote{The algorithm first extracted the textual content under each node of the document's DOM tree.
+    It then compared paragraphs in the cleaned text with nodes in the DOM tree by calculating Dice coefficients measuring word overlap between each paragraph and the textual content below each tree node.
+    If this process found a node that was sufficiently similar to a clean paragraph, the node was marked as clean.
+    The information was then propagated up and down the tree to previously selected target nodes, i.e.~nodes in the DOM tree were 10\% of their -- including all their children's --  textual content was contained in text nodes among its \emph{immediate} children, in the following way:
+    all target nodes contained in a clean subtree were marked as clean;
+    all target nodes that dominated more clean than non-clean nodes were marked as clean, too.
+    All remaining target nodes were marked as dirty.} aligned with the manually annoteated text files and the result was transformed for the KrdWrd system namely,
+    dirty nodes were marked as ``Junk'', clean nodes were marked as ``Good'', and all others were marked as ``Uncertain''.
+    %translation matrix (though it's not quite clear where these 'two' alignments come from):
+    %clean='1' -> krdwrd-tag-2 (uncertain)
+    %clean='2' & target='1' -> krdwrd-tag-3 (good)
+    %clean='3' -> krdwrd-tag-3 (good)
+    The output obtained in this way is not perfect but for many pages it works quite well and some manual refinement is easily possible.
+
+
+    original 158 file
+    now 134 files
+
+    missing files
+
+
+    cf. \url{http://en.wikipedia.org/wiki/Character_encodings_in_HTML}
+    \begin{verbatim}
+    documents in the corpus are stripped of their root element to avoid - at least some - (en- and) decoding difficulties: when requesting HTML documents from a server the encoding  can be defined in three ways (and we found instances where the encoding was defined multiple times - with diferent content). 
+    - as part of the HTTP-response (Content-Type:)
+    - as part of the <HEAD> element
+    - as part of the XML preamble (for XHTML only)
+
+
+    !!! alternative text of images (img alt txt) has no initial <kw> and/or propagated krdwrd-tag
+
+    <p class='krdwrd-tag-3' target='1'>
+    <b class='krdwrd-tag-2'>Aperture:</b> Light passing through a camera lens must pass through a hole called an aperture. The aperture�[34m~@~Ys size can hanged manually or automatically by the camera. The size of the aperture affects how much light gets through the lens and onto the film or sensor. Very small apertures increase the depth of field and require longer exposure times because very little light reaches the sensor. And very large apertures decrease the depth of field and allow for shorter exposure times. Aperture is measured by a number call the <b>f-stop.</b>
+
+    \end{verbatim}
+    (harvest.py)
+    - relativize base-url
+    - discard pages with little text content
+
+
+    \begin{verbatim}
+    check baseref:
+    14_merged.xhtml.html
+    36_merged.xhtml.html
+    66_merged.xhtml.html
+    107_merged.xhtml.html
+    287_merged.xhtml.html
+
+13 (blank)
+86 (garbage text)
+212 (content missing)
+213 (content missing)
+217 (content missing)
+220 (content missing)
+222 (content missing)
+223 (content missing)
+227 (content missing)
+
+    \end{verbatim}
+
+
+\paragraph{BONUS: Victor Corpus}
+
+%
+
+%\subsection{\label{subsec:corpusannotation}Corpus Annotation}
+%the data from the DB made available by python-foo backend, proxy in off-line mode, add-on for highlighting, marking, submitting, resubmitting, etc. pages 
+
+%\subsubsection{\label{subsec:annotationinspection}Annotation Inspection}
+%corpus analysis - mostly administrative tools on the server
+%corpus analysis, stats, etc.
+
+%\subsection{\label{subsec:supervisedtraining}Supervised Training}
+%\subsubsection{\label{subsec:featureextraction}Feature Extraction}
+%- training + classification
+% - extraxtion tools
+% - learning tools
+
+%\subsection{\label{subsec:sweepingunseencontent}Sweeping Unseen Content}
+%
+%
+\end{longversion}
+
diff --git a/900bib.tex b/900bib.tex
new file mode 100644
index 0000000..2b86288
--- /dev/null
+++ b/900bib.tex
@@ -0,0 +1,23 @@
+\renewcommand\thesection{$\Omega$}
+
+\setbibpreamble{The Web sites referred to below were last accessed on March 20, 2009.
+In case of unavailability at a later time, we recommend visiting the \href{http://archive.org}{Internet Archive}.\bigskip } 
+
+\newpage
+% probably this is NOT desired
+%\nocite{*}
+
+% in case References should be namd Bibliography
+%\sectionmark{Bibliography}
+%\renewcommand\refname{Bibliography}
+
+% ...might be nice
+%\small
+\footnotesize
+
+\bibliographystyle{myalphaurl}
+\bibliography{thesis}
+\normalsize
+
+% usually this is NOT necessary; however, in some rare cases it is
+\addcontentsline{toc}{section}{Bibliography}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ea9baf6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,12 @@
+RUB:=rubber
+RUBARGS:=-I ./images -df
+
+default: canola.pdf
+
+.PHONY: canola.pdf
+
+canola.pdf: canola.tex *.tex *.bib ./images/* 
+	$(RUB) $(RUBARGS) $<
+
+clean:
+	rm -f canola.pdf canola.out canola.log canola.aux canola.bbl canola.blg canola.toc
diff --git a/canola.tex b/canola.tex
new file mode 100644
index 0000000..785a407
--- /dev/null
+++ b/canola.tex
@@ -0,0 +1,10 @@
+\newcommand{\myauthor}{Egon W.~Stemle}
+\newcommand{\mymail}{egon.stemle@unitn.it}
+\newcommand{\mytitle}{The KrdWrd CANOLA Corpus -- Gathering Training Data for Sweeping Web Pages}
+\newcommand{\myStitle}{The KrdWrd CANOLA Corpus}
+\newcommand{\mypdftitle}{\myStitle}
+\newcommand{\mykeywords}{}
+
+\newcommand{\KrdWrd}[0]{KrdWrd}
+
+\input{000inputs}
diff --git a/images/addon2.png b/images/addon2.png
new file mode 100644
index 0000000..e201a15
Binary files /dev/null and b/images/addon2.png differ
diff --git a/images/agreementonpages.pdf b/images/agreementonpages.pdf
new file mode 100644
index 0000000..0ee4664
Binary files /dev/null and b/images/agreementonpages.pdf differ
diff --git a/images/pagesperuser.pdf b/images/pagesperuser.pdf
new file mode 100644
index 0000000..32cf6dc
Binary files /dev/null and b/images/pagesperuser.pdf differ
diff --git a/images/postprop.png b/images/postprop.png
new file mode 100644
index 0000000..73dfdd2
Binary files /dev/null and b/images/postprop.png differ
diff --git a/images/preprop.png b/images/preprop.png
new file mode 100644
index 0000000..04bc13d
Binary files /dev/null and b/images/preprop.png differ
diff --git a/images/sequencedelta.pdf b/images/sequencedelta.pdf
new file mode 100644
index 0000000..9ce5358
Binary files /dev/null and b/images/sequencedelta.pdf differ
diff --git a/images/submsperpage_hist.pdf b/images/submsperpage_hist.pdf
new file mode 100644
index 0000000..37d8098
Binary files /dev/null and b/images/submsperpage_hist.pdf differ
diff --git a/images/timespentonassignment.pdf b/images/timespentonassignment.pdf
new file mode 100644
index 0000000..9a1fee7
Binary files /dev/null and b/images/timespentonassignment.pdf differ
diff --git a/images/timespentonpage.pdf b/images/timespentonpage.pdf
new file mode 100644
index 0000000..500b047
Binary files /dev/null and b/images/timespentonpage.pdf differ
diff --git a/myalphaurl.bst b/myalphaurl.bst
new file mode 100644
index 0000000..ba72e44
--- /dev/null
+++ b/myalphaurl.bst
@@ -0,0 +1,1415 @@
+%%% Modification of BibTeX style file /local2/teTeX-1.0/share/texmf/bibtex/bst/base/alpha.bst
+%%% to add webpage entry type, and url and lastchecked fields
+%%% Edits by urlbst.pl, version 0.1
+%%% (marked with % urlbst.pl)
+%%% Original headers follow...
+% BibTeX standard bibliography style `alpha'
+	% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
+	% Copyright (C) 1985, all rights reserved.
+	% Copying of this file is authorized only if either
+	% (1) you make absolutely no changes to your copy, including name, or
+	% (2) if you do make changes, you name it something other than
+	% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
+	% This restriction helps ensure that all standard styles are identical.
+	% The file btxbst.doc has the documentation for this style.
+
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+    url % urlbst.pl
+    lastchecked
+  }
+  {}
+  { label extra.label sort.label }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+INTEGERS { bracket.state outside.brackets open.brackets within.brackets close.brackets } % urlbst.pl
+FUNCTION {init.state.consts}
+{ #0 'outside.brackets := % urlbst.pl
+  #1 'open.brackets :=
+  #2 'within.brackets :=
+  #3 'close.brackets :=
+
+   #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+% urlbst.pl
+FUNCTION {output.nonnull.original}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+	{ add.period$ write$
+	  newline$
+	  "\newblock " write$
+	}
+	{ output.state before.all =
+	    'write$
+	    { add.period$ " " * write$ }
+	  if$
+	}
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+% urlbst.pl...
+FUNCTION {output.nonnull}
+{ % Save the thing we've been asked to output
+  's :=
+  % If the bracket-state is close.brackets, then add a close-bracket to what's
+  % currently at the top of the stack, and set bracket.state to outside.brackets
+  bracket.state close.brackets =
+    { "]" *
+      outside.brackets 'bracket.state :=
+    }
+    'skip$
+  if$
+  bracket.state outside.brackets =
+    { % We're outside all brackets -- this is the normal situation.
+      % Write out what's currently at the top of the stack, using the
+      % original output.nonnull function.
+      s
+      output.nonnull.original
+    }
+    { % Still in brackets.  Add open-bracket or (continuation) comma, add the
+      % new text (in s) to the top of the stack, and move to the close-brackets
+      % state, ready for next time (unless inbrackets resets it).  If we come
+      % into this branch, then output.state is carefully undisturbed.
+      bracket.state open.brackets =
+        { " [" * }
+        { ", " * } % bracket.state will be within.brackets
+      if$ 
+      s * 
+      close.brackets 'bracket.state :=
+    }
+  if$
+}
+
+% Call this function just before adding something which should be presented in 
+% brackets.  bracket.state is handled specially within output.nonnull.
+FUNCTION {inbrackets}
+{ bracket.state close.brackets =
+    { within.brackets 'bracket.state := } % reset the state: not open nor closed
+    { open.brackets 'bracket.state := }
+  if$
+}
+
+FUNCTION {format.url}
+{ url empty$
+    { "" }
+    { "Available from: \url{" url * "}" * }
+  if$
+}
+
+FUNCTION {format.lastchecked}
+{ lastchecked empty$
+    { "" }
+    { inbrackets "cited " lastchecked * }
+  if$
+}
+%  ...% urlbst.pl
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ outside.brackets 'bracket.state := % urlbst.pl
+   newline$
+  "\bibitem[" write$
+  label write$
+  "]{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {fin.entry}
+{ 
+  bracket.state close.brackets = % urlbst.pl
+    { "]" * }
+    'skip$
+  if$
+   add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+	'skip$
+	{ after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+	{ namesleft #1 >
+	    { ", " * t * }
+	    { numnames #2 >
+		{ "," * }
+		'skip$
+	      if$
+	      t "others" =
+		{ " et~al." * }
+		{ " and " * t * }
+	      if$
+	    }
+	  if$
+	}
+	't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+	{ ", editors" * }
+	{ ", editor" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+	{ t #1 #2 substring$ "--" = not
+	    { "--" *
+	      t #2 global.max$ substring$ 't :=
+	    }
+	    {   { t #1 #1 substring$ "-" = }
+		{ "-" *
+		  t #2 global.max$ substring$ 't :=
+		}
+	      while$
+	    }
+	  if$
+	}
+	{ t #1 #1 substring$ *
+	  t #2 global.max$ substring$ 't :=
+	}
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year empty$
+    { month empty$
+	{ "" }
+	{ "there's a month but no year in " cite$ * warning$
+	  month
+	}
+      if$
+    }
+    { month empty$
+	'year
+	{ month " " * year * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+	'skip$
+	{ " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+	{ series field.or.null }
+	{ output.state mid.sentence =
+	    { "number" }
+	    { "Number" }
+	  if$
+	  number tie.or.space.connect
+	  series empty$
+	    { "there's a number but no series in " cite$ * warning$ }
+	    { " in " * series * }
+	  if$
+	}
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+	{ edition "l" change.case$ " edition" * }
+	{ edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+	{ #1 'multiresult := }
+	{ t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+	{ "pages" pages n.dashify tie.or.space.connect }
+	{ "page" pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "(" number * ")" * *
+      volume empty$
+	{ "there's a number but no volume in " cite$ * warning$ }
+	'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+	{ pop$ format.pages }
+	{ ":" * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+	{ "chapter" }
+	{ type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+	'skip$
+	{ ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+	{ "In " booktitle emphasize * }
+	{ "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+	{ "need key or journal for " cite$ * " to crossref " * crossref *
+	  warning$
+	  ""
+	}
+	{ "In {\em " journal * "\/}" * }
+      if$
+    }
+    { "In " key * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.crossref.editor}
+{ editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+	'skip$
+	{ editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+	    { " et~al." * }
+	    { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+	  if$
+	}
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+	{ series empty$
+	    { "need editor, key, or series for " cite$ * " to crossref " *
+	      crossref * warning$
+	      "" *
+	    }
+	    { "{\em " * series * "\/}" * }
+	  if$
+	}
+	{ key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+	{ booktitle empty$
+	    { "need editor, key, or booktitle for " cite$ * " to crossref " *
+	      crossref * warning$
+	      ""
+	    }
+	    { "In {\em " booktitle * "\/}" * }
+	  if$
+	}
+	{ "In " key * }
+      if$
+    }
+    { "In " format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+% urlbst.pl...
+% Output a URL.  We can't use the more normal idiom (something like
+% `format.url output'), because the `inbrackets' within
+% format.lastchecked applies to everything between calls to `output',
+% so that `format.url format.lastchecked * output' ends up with both
+% the URL and the lastchecked in brackets.
+FUNCTION {output.url}
+{ url empty$
+    'skip$ 
+    { new.block 
+      format.url output
+      format.lastchecked output 
+    }
+  if$
+}
+
+% Webpage entry type.
+% Title and url fields required;
+% author, note, year, month, and lastchecked fields optional
+% See references 
+%   ISO 690-2 http://www.nlc-bnc.ca/iso/tc46sc9/standard/690-2e.htm
+%   http://www.classroom.net/classroom/CitingNetResources.html
+%   http://neal.ctstateu.edu/history/cite.html
+%   http://www.cas.usf.edu/english/walker/mla.html
+% for citation formats for web pages.
+FUNCTION {webpage}
+{ output.bibitem
+  author empty$
+    'skip$
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.title "title" output.check
+  inbrackets "online" output
+  new.block
+  year empty$
+    'skip$
+    { format.date "year" output.check }
+  if$
+  lastchecked empty$
+    'skip$
+    { format.lastchecked output }
+  if$
+  new.block
+  format.url "url" output.check
+  new.block
+  note output
+  fin.entry
+}
+%   ...% urlbst.pl
+
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      format.vol.num.pages output
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+	{ "author and editor" editor either.or.check }
+	'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+	{ "author and editor" editor either.or.check }
+	'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+	{ organization publisher new.sentence.checkb
+	  organization output
+	  publisher output
+	  format.date "year" output.check
+	}
+	{ address output.nonnull
+	  format.date "year" output.check
+	  new.sentence
+	  organization output
+	  publisher output
+	}
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  author empty$
+    { organization empty$
+	'skip$
+	{ organization output.nonnull
+	  address output
+	}
+      if$
+    }
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  author empty$
+    { organization empty$
+	{ address new.block.checka
+	  address output
+	}
+	'skip$
+      if$
+    }
+    { organization address new.block.checkb
+      organization output
+      address output
+    }
+  if$
+  format.edition output
+  format.date output
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  editor empty$
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address empty$
+    { editor empty$
+	{ publisher new.sentence.checka }
+	{ organization publisher new.sentence.checkb
+	  organization output
+	}
+      if$
+      publisher output
+      format.date "year" output.check
+    }
+    { address output.nonnull
+      format.date "year" output.check
+      new.sentence
+      editor empty$
+	'skip$
+	{ organization output }
+      if$
+      publisher output
+    }
+  if$
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  output.url % urlbst.pl
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+READ
+
+FUNCTION {sortify}
+{ 
+  type$ "webpage" =  
+    { skip$ }
+    { purify$ }
+  if$
+
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+INTEGERS { et.al.char.used }
+
+FUNCTION {initialize.et.al.char.used}
+{ #0 'et.al.char.used :=
+}
+
+EXECUTE {initialize.et.al.char.used}
+
+FUNCTION {format.lab.names}
+{ 's :=
+  s num.names$ 'numnames :=
+  numnames #1 >
+    { numnames #4 >
+	{ #3 'namesleft := }
+	{ numnames 'namesleft := }
+      if$
+      #1 'nameptr :=
+      ""
+	{ namesleft #0 > }
+	{ nameptr numnames =
+	    { s nameptr "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+		{ "{\etalchar{+}}" *
+		  #1 'et.al.char.used :=
+		}
+		{ s nameptr "{v{}}{l{}}" format.name$ * }
+	      if$
+	    }
+	    { s nameptr "{v{}}{l{}}" format.name$ * }
+	  if$
+	  nameptr #1 + 'nameptr :=
+	  namesleft #1 - 'namesleft :=
+	}
+      while$
+      numnames #4 >
+	{ "{\etalchar{+}}" *
+	  #1 'et.al.char.used :=
+	}
+	'skip$
+      if$
+    }
+    { s #1 "{v{}}{l{}}" format.name$
+      duplicate$ text.length$ #2 <
+	{ pop$ s #1 "{ll}" format.name$ #3 text.prefix$ }
+	'skip$
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ key empty$
+    { author empty$
+	{ cite$ #1 #3 substring$ }
+	{ author format.lab.names }
+      if$
+    }
+  { key #5 text.prefix$ }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ key empty$
+    { author empty$
+	{ editor empty$
+		{ cite$ #1 #3 substring$ }
+		{ editor format.lab.names }
+	  if$
+	}
+    	{ author format.lab.names }
+      if$
+    }
+    { key #3 text.prefix$ }
+  if$
+}
+
+FUNCTION {author.key.organization.label}
+{ author empty$
+    { key empty$
+	{ organization empty$
+	    { cite$ #1 #3 substring$ }
+	    { "The " #4 organization chop.word #3 text.prefix$ }
+	  if$
+	}
+	{ key #3 text.prefix$ }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.organization.label}
+{ editor empty$
+    { key empty$
+	{ organization empty$
+	    { cite$ #1 #3 substring$ }
+	    { "The " #4 organization chop.word #3 text.prefix$ }
+	  if$
+	}
+	{ key #3 text.prefix$ }
+      if$
+    }
+    { editor format.lab.names purify$ }
+  if$
+}
+
+FUNCTION {calc.label}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+	'editor.key.organization.label
+	{ type$ "manual" =
+	    'author.key.organization.label
+	    'author.key.label
+	  if$
+	}
+      if$
+    }
+  if$
+  duplicate$
+  year field.or.null purify$ #-1 #2 substring$
+  *
+  'label :=
+  year field.or.null purify$ #-1 #4 substring$
+  *
+  sortify 'sort.label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+	{ "   " * }
+	'skip$
+      if$
+      s nameptr "{vv{ } }{ll{ }}{  ff{ }}{  jj{ }}" format.name$ 't :=
+      nameptr numnames = t "others" = and
+	{ "et al" * }
+	{ t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+	{ "to sort, need author or key in " cite$ * warning$
+	  ""
+	}
+	{ key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+	{ key empty$
+	    { "to sort, need author, editor, or key in " cite$ * warning$
+	      ""
+	    }
+	    { key sortify }
+	  if$
+	}
+	{ editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+	{ key empty$
+	    { "to sort, need author, organization, or key in " cite$ * warning$
+	      ""
+	    }
+	    { key sortify }
+	  if$
+	}
+	{ "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+	{ key empty$
+	    { "to sort, need editor, organization, or key in " cite$ * warning$
+	      ""
+	    }
+	    { key sortify }
+	  if$
+	}
+	{ "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION {presort}
+{ calc.label
+  sort.label
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+	'editor.organization.sort
+	{ type$ "manual" =
+	    'author.organization.sort
+	    'author.sort
+	  if$
+	}
+      if$
+    }
+  if$
+  *
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label last.sort.label next.extra }
+
+INTEGERS { longest.label.width last.extra.num }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #0 int.to.chr$ 'last.sort.label :=
+  "" 'next.extra :=
+  #0 'longest.label.width :=
+  #0 'last.extra.num :=
+}
+
+FUNCTION {forward.pass}
+{ last.sort.label sort.label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num int.to.chr$ 'extra.label :=
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      sort.label 'last.sort.label :=
+    }
+  if$
+}
+
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  label extra.label * 'label :=
+  label width$ longest.label.width >
+    { label 'longest.label :=
+      label width$ 'longest.label.width :=
+    }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION {begin.bib}
+{ et.al.char.used
+    { "\newcommand{\etalchar}[1]{$^{#1}$}" write$ newline$ }
+    'skip$
+  if$
+  preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{"  longest.label  * "}" * write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
diff --git a/thesis.bib b/thesis.bib
new file mode 100644
index 0000000..e24901a
--- /dev/null
+++ b/thesis.bib
@@ -0,0 +1,568 @@
+
+@InProceedings{	  baronisilvia2004,
+  author	= {Marco Baroni and Silvia Bernardini},
+  title		= {{BootCaT}: Bootstrapping corpora and terms from the web},
+  year		= {2004},
+  pages		= {1313--1316},
+  crossref	= {lrec2004},
+  file		= {bootcat_lrec_2004.pdf:http\://sslmit.unibo.it/~baroni/lrec2004/bootcat_lrec_2004.pdf:PDF}
+		  ,
+  owner		= {egon},
+  timestamp	= {2008.08.17},
+  url		= {http://sslmit.unibo.it/~baroni/publications/lrec2004/bootcat_lrec_2004.pdf}
+		  
+}
+
+@InProceedings{	  baronichantreekilgarriffsharoff2008,
+  author	= {Marco Baroni and Francis Chantree and Kilgarriff and Serge
+		  Sharoff},
+  title		= {{C}lean{E}val: A competition for cleaning Web pages},
+  year		= {2008},
+  crossref	= {lrec2008},
+  file		= {lrec08-cleaneval.pdf:http\://clic.cimec.unitn.it/marco/publications/lrec2008/lrec08-cleaneval.pdf:PDF}
+		  ,
+  owner		= {egon},
+  timestamp	= {2008.08.15},
+  url		= {http://clic.cimec.unitn.it/marco/publications/lrec2008/lrec08-cleaneval.pdf}
+		  
+}
+
+@InProceedings{	  fiasco2007,
+  author	= {Daniel Bauer and Judith Degen and Xiaoye Deng and Priska
+		  Herger and Jan Gasthaus and Eugenie Giesbrecht and Lina
+		  Jansen and Christin Kalina and Thorben Kr\"{u}ger and
+		  Robert M\"{a}rtin and Martin Schmidt and Simon Scholler and
+		  Johannes Steger and Egon Stemle and Stefan Evert},
+  title		= {{FIASCO}: {F}iltering the {I}nternet by {A}utomatic
+		  {S}ubtree {C}lassification, {O}snabr\"{u}ck},
+  year		= {2007},
+  crossref	= {WAC3},
+  owner		= {egon},
+  timestamp	= {2007.11.20},
+  url		= {http://purl.org/stefan.evert/PUB/BauerEtc2007_FIASCO.pdf}
+}
+
+@InProceedings{	  evert2008,
+  author	= {Stefan Evert},
+  title		= {A lightweight and efficient tool for cleaning Web pages},
+  year		= {2008},
+  abstract	= {Originally conceived as a 'naive' baseline experiment
+		  using traditional n-gram language models as classifiers,
+		  the NCLEANER system has turned out to be a fast and
+		  lightweight tool for cleaning Web pages with
+		  state-of-the-art accuracy (based on results from the
+		  CLEANEVAL competition held in 2007). Despite its
+		  simplicity, the algorithm achieves a significant
+		  improvement over the baseline (i.e. plain, uncleaned text
+		  dumps), trading off recall for substantially higher
+		  precision. NCLEANER is available as an open-source software
+		  package. It is preconfigured for English Web pages, but can
+		  adapted to other languages with minimal amounts of manually
+		  cleaned training data. Since NCLEANER does not make use of
+		  HTML structure, it can also be applied to existing Web
+		  corpora that are only available in plain text format, with
+		  a minor loss in classification accuracy.},
+  crossref	= {lrec2008},
+  file		= {Evert2008_NCleaner.pdf:http\://purl.org/stefan.evert/PUB/Evert2008_NCleaner.pdf:PDF}
+		  ,
+  owner		= {egon},
+  timestamp	= {2008.08.15},
+  url		= {http://purl.org/stefan.evert/PUB/Evert2008_NCleaner.pdf}
+}
+
+@InProceedings{	  spoustamarekpecina2008,
+  author	= {Miroslav Spousta and Michal Marek and Pavel Pecina},
+  title		= {Victor: the Web-Page Cleaning Tool},
+  booktitle	= {Proceedings of the 4th Web as Corpus Workshop ({WAC}4) --
+		  Can we beat {G}oogle?},
+  year		= {2008},
+  crossref	= {WAC4},
+  owner		= {egon},
+  timestamp	= {2009.04.02}
+}
+
+@WEBPAGE{	  debian.org,
+  title		= {Debian {GNU}/{L}inux: The free Operating System for your
+		  Computer},
+  url		= {http://www.debian.org/},
+  lastchecked	= {032009},
+  key		= {{@DEB}},
+  crossref	= {gnu.org},
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@WEBPAGE{	  firefox,
+  title		= {Firefox: The Browser That Has It All},
+  url		= {http://www.mozilla.com/firefox/},
+  lastchecked	= {032009},
+  key		= {{@FF}},
+  crossref	= {mozilla.org},
+  owner		= {egon},
+  timestamp	= {2008.08.17}
+}
+
+@Article{	  artsteinpoesio2008,
+  author	= {Artstein, Ron and Poesio, Massimo},
+  title		= {Inter-Coder Agreement for Computational Linguistics},
+  journal	= {Computational Linguistics},
+  year		= {2008},
+  volume	= {34},
+  pages		= {555-596},
+  number	= {4},
+  abstract	= {This article is a survey of methods for measuring
+		  agreement among corpus annotators. It exposes the
+		  mathematics and underlying assumptions of agreement
+		  coefficients, covering Krippendorff's alpha as well as
+		  Scott's pi and Cohen's kappa; discusses the use of
+		  coefficients in several annotation tasks; and argues that
+		  weighted, alpha-like coefficients, traditionally less used
+		  than kappa-like measures in computational linguistics, may
+		  be more appropriate for many corpus annotation tasks -- but
+		  that their use makes the interpretation of the value of the
+		  coefficient even harder.},
+  doi		= {10.1162/coli.07-034-R2},
+  eprint	= {http://www.mitpressjournals.org/doi/pdf/10.1162/coli.07-034-R2}
+		  ,
+  owner		= {egon},
+  timestamp	= {2009.04.05},
+  url		= {http://www.mitpressjournals.org/doi/abs/10.1162/coli.07-034-R2}
+		  
+}
+
+@WEBPAGE{	  cleaneval/annotation_guidelines,
+  title		= {{CLEANEVAL}: Guidelines for annotators},
+  url		= {http://cleaneval.sigwac.org.uk/annotation_guidelines.html}
+		  ,
+  author	= {Marco Baroni and Serge Sharoff},
+  lastchecked	= {032009},
+  key		= {{@CEannos}},
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@WEBPAGE{	  wwwoffle,
+  title		= {A simple proxy server with special features for use with
+		  dial-up Internet Links},
+  url		= {http://gedanken.demon.co.uk/wwwoffle/},
+  author	= {Andrew M. Bishop},
+  lastchecked	= {032009},
+  key		= {{@WOFFL}},
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@InProceedings{	  grefenstettenioche2000,
+  author	= {Gregory Grefenstette and Julien Nioche},
+  title		= {Estimation of English and non-English language use on the
+		  {WWW}},
+  booktitle	= {In Recherche d'Information Assist\'{e}e par Ordinateur
+		  ({RIAO})},
+  year		= {2000},
+  pages		= {237--246},
+  owner		= {egon},
+  timestamp	= {2009.03.27}
+}
+
+@WEBPAGE{	  url,
+  title		= {Uniform Resource Locators: A Syntax for the Expression of
+		  Access Information of Objects on the Network},
+  url		= {http://www.w3.org/Addressing/URL/url-spec.txt},
+  author	= {{URI} working Group},
+  lastchecked	= {032009},
+  key		= {{@URL}},
+  comment	= {cf. w3.org/Addressing},
+  organization	= {W3C},
+  owner		= {egon},
+  timestamp	= {2009.03.19}
+}
+
+@Article{	  kilgarriff2007,
+  author	= {Kilgarriff,, Adam},
+  title		= {Googleology is Bad Science},
+  journal	= {Comput. Linguist.},
+  year		= {2007},
+  volume	= {33},
+  pages		= {147--151},
+  number	= {1},
+  address	= {Cambridge, MA, USA},
+  doi		= {http://dx.doi.org/10.1162/coli.2007.33.1.147},
+  issn		= {0891-2017},
+  owner		= {egon},
+  publisher	= {MIT Press},
+  timestamp	= {2009.03.27}
+}
+
+@Article{	  kilgarriffgrefenstette03,
+  author	= {Adam Kilgarriff and Gregory Grefenstette},
+  title		= {Introduction to the special issue on the web as corpus},
+  journal	= {Computational Linguistics},
+  year		= {2003},
+  volume	= {29},
+  pages		= {333--347},
+  owner		= {egon},
+  timestamp	= {2009.03.21}
+}
+
+@Manual{	  r-project,
+  title		= {R: A Language and Environment for Statistical Computing},
+  author	= {{R Development Core Team}},
+  organization	= {R Foundation for Statistical Computing},
+  address	= {Vienna, Austria},
+  year		= {2008},
+  note		= {{ISBN} 3-900051-07-0},
+  owner		= {egon},
+  timestamp	= {2009.04.05},
+  url		= {http://www.R-project.org}
+}
+
+@Article{	  vips,
+  author	= {Song Ruihua and Liu Haifeng and Wen Ji-Rong and Ma
+		  Wei-Ying},
+  title		= {Learning important models for web page blocks based on
+		  layout and content analysis},
+  journal	= {SIGKDD Explor. Newsl.},
+  year		= {2004},
+  volume	= {6},
+  pages		= {14--23},
+  number	= {2},
+  address	= {New York, NY, USA},
+  doi		= {http://doi.acm.org/10.1145/1046456.1046459},
+  issn		= {1931-0145},
+  owner		= {egon},
+  publisher	= {ACM},
+  timestamp	= {2009.04.06}
+}
+
+@WEBPAGE{	  krdwrd.org,
+  title		= {The {K}rd{W}rd Project Web Site},
+  url		= {https://krdwrd.org/},
+  author	= {Johannes M. Steger and Egon W. Stemle},
+  lastchecked	= {032009},
+  key		= {{@KRDWRD}},
+  owner		= {egon},
+  timestamp	= {2008.08.15}
+}
+
+@WEBPAGE{	  w3.org/addressing,
+  title		= {Naming and Addressing: {URIs}, {URLs}, \ldots},
+  url		= {http://www.w3.org/Addressing/},
+  author	= {W3C},
+  lastchecked	= {032009},
+  key		= {{@ADDRESSING}},
+  organization	= {W3C},
+  owner		= {egon},
+  timestamp	= {2009.03.19}
+}
+
+@Proceedings{	  lrec2004,
+  title		= {Proceedings of the 4th International Conference on
+		  Language Resources and Evaluation ({LREC} 2004)},
+  year		= {2004},
+  editor	= {European Language Resources Association (ELRA)},
+  address	= {Lisbon, Portugal},
+  month		= may,
+  owner		= {egon},
+  timestamp	= {2008.08.17},
+  url		= {http://www.lrec-conf.org/lrec2004/}
+}
+
+@Proceedings{	  lrec2008,
+  title		= {Proceedings of the 6th International Conference on
+		  Language Resources and Evaluation ({LREC} 2008)},
+  year		= {2008},
+  editor	= {European Language Resources Association ({ELRA})},
+  address	= {Marrakech, Morocco},
+  month		= may,
+  owner		= {egon},
+  timestamp	= {2008.08.17},
+  url		= {http://www.lrec-conf.org/lrec2008/}
+}
+
+@Proceedings{	  corpuslinguistics2005,
+  title		= {Proceedings of Corpus Linguistics 2005},
+  year		= {2005},
+  editor	= {Pernilla Danielsson and Martijn Wagenmakers},
+  volume	= {1},
+  series	= {The Corpus Linguistics Conference Series},
+  note		= {ISSN 1747-9398},
+  owner		= {egon},
+  timestamp	= {2009.03.27}
+}
+
+@Proceedings{	  wac4,
+  title		= {Can we beat Google? ({WAC}4 - 2008) -- Proceedings of the
+		  4th web as corpus workshop},
+  year		= {2008},
+  editor	= {Stefan Evert and Adam Kilgarriff and Serge Sharoff},
+  month		= {06},
+  owner		= {egon},
+  timestamp	= {2009.03.23},
+  url		= {http://webascorpus.sourceforge.net/download/WAC4_2008_Proceedings.pdf}
+		  
+}
+
+@Proceedings{	  wac3,
+  title		= {Building and Exploring Web Corpora ({WAC}3 - 2007) --
+		  Proceedings of the 3rd web as corpus workshop,
+		  incorporating {CLEANEVAL}},
+  year		= {2007},
+  editor	= {C\'{e}drick Fairon and Hubert Naets and Adam Kilgarriff
+		  and Gilles-Maurice de Schryver},
+  address	= {Louvain-la-Neuve},
+  publisher	= {Presses universitaires de Louvain},
+  month		= jul,
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@Proceedings{	  wac2,
+  title		= {11th Conference of the European Chapter of the Association
+		  for Computational Linguistics -- Proceedings of the 2nd
+		  International Workshop on Web as Corpus},
+  year		= {2006},
+  editor	= {Adam Kilgarriff and Marco Baroni},
+  address	= {Trento, Italy},
+  month		= apr,
+  owner		= {egon},
+  timestamp	= {2009.03.21},
+  url		= {http://www.aclweb.org/anthology-new/W/W06/W06-1700.pdf}
+}
+
+@Booklet{	  bnc/userlicence,
+  title		= {The {B}ritish {N}ational {C}orpus ({BNC}) User Licence},
+  howpublished	= {Online Version},
+  lastchecked	= {032009},
+  organization	= {University of Oxford},
+  owner		= {egon},
+  timestamp	= {2009.03.27},
+  url		= {http://www.natcorp.ox.ac.uk/docs/licence.pdf}
+}
+
+@WEBPAGE{	  cgi,
+  title		= {The {C}ommon {G}ateway {I}nterface ({CGI}) -- a standard
+		  for external gateway programs to interface with information
+		  servers such as HTTP servers},
+  url		= {http://hoohoo.ncsa.uiuc.edu/cgi/overview.html},
+  lastchecked	= {032009},
+  key		= {{@CGI}},
+  owner		= {egon},
+  timestamp	= {2009.03.21}
+}
+
+@WEBPAGE{	  gnu.org,
+  title		= {{GNU} operating system},
+  url		= {http://www.gnu.org/},
+  lastchecked	= {032009},
+  key		= {{@GNU}},
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@WEBPAGE{	  httpd.apache.org,
+  title		= {The Apache {HTTP} Server Project},
+  url		= {http://httpd.apache.org/},
+  lastchecked	= {032009},
+  key		= {{@HTTP}},
+  organization	= {The Apache Software Foundation},
+  owner		= {egon},
+  timestamp	= {2009.03.21}
+}
+
+@WEBPAGE{	  ikw/cl,
+  title		= {The Computational Linguistics Group of the {I}nstitute of
+		  {C}ognitive {S}cience at the {U}niversity of
+		  {O}snabr\"{u}ck},
+  url		= {http://www.ikw.uni-osnabrueck.de/CL/},
+  lastchecked	= {032009},
+  key		= {{@CL}},
+  owner		= {egon},
+  timestamp	= {2009.03.22}
+}
+
+@Manual{	  krdwrd.org/manual,
+  title		= {Add-on Manual},
+  organization	= {The KrdWrd Project},
+  note		= {Online Version at \url{https://krdwrd.org/manual/html/}},
+  owner		= {egon},
+  timestamp	= {2009.04.04},
+  url		= {https://krdwrd.org/manual/manual.pdf}
+}
+
+@WEBPAGE{	  mozilla.org,
+  title		= {The Mozilla project},
+  url		= {http://www.mozilla.org/},
+  lastchecked	= {032009},
+  key		= {{@MOZILLA}},
+  organization	= {Mozilla Corporation},
+  owner		= {egon},
+  timestamp	= {2008.08.17}
+}
+
+@WEBPAGE{	  potamodule,
+  title		= {A Perl module intended to perform ``boilerplate''
+		  stripping and other forms of filtering},
+  url		= {http://sslmitdev-online.sslmit.unibo.it/wac/post_processing.php}
+		  ,
+  lastchecked	= {032009},
+  key		= {{@POTA}},
+  organization	= {SSLMIT Dev, University of Bologna},
+  owner		= {egon},
+  timestamp	= {2009.03.23}
+}
+
+@WEBPAGE{	  python,
+  title		= {Python: An interpreted, interactive, object-oriented
+		  programming language},
+  url		= {http://www.python.org/},
+  lastchecked	= {032009},
+  key		= {{@PYTHON}},
+  owner		= {egon}
+}
+
+@WEBPAGE{	  subversion,
+  title		= {An open source version control system},
+  url		= {http://subversion.tigris.org/},
+  lastchecked	= {032009},
+  key		= {{@SVN}},
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@WEBPAGE{	  sz-archiv.de/agb,
+  title		= {S\"{u}ddeutsche {Z}eitung {A}rchiv -- {A}llgemeine
+		  {G}esch\"{a}ftsbedingungen},
+  url		= {http://www.sz-archiv.de/sueddeutsche-zeitung-archiv/onlinearchive/sz-aboarchiv-ubersicht/sz-aboarchiv-agb}
+		  ,
+  lastchecked	= {032009},
+  key		= {{@SZAGB}},
+  organization	= {DIZ München GmbH},
+  owner		= {egon},
+  timestamp	= {2009.03.27}
+}
+
+@WEBPAGE{	  trac,
+  title		= {An enhanced Wiki and issue tracking system for software
+		  development projects},
+  url		= {http://trac.edgewall.org/},
+  lastchecked	= {032009},
+  key		= {{@TRAC}},
+  owner		= {egon},
+  timestamp	= {2008.08.16}
+}
+
+@WEBPAGE{	  unicode.org,
+  title		= {Unicode Home Page},
+  url		= {http://www.unicode.org/},
+  lastchecked	= {032009},
+  key		= {{@UNICODE}},
+  organization	= {Unicode, Inc.},
+  owner		= {egon}
+}
+
+@WEBPAGE{	  w3.org/base,
+  title		= {{HTML} 4.01 Specification -- Path information: the {BASE}
+		  element},
+  url		= {http://www.w3.org/TR/html401/struct/links.html#h-12.4},
+  lastchecked	= {032009},
+  key		= {{@W3base}},
+  organization	= {The {W}orld {W}ide {W}eb Consortium ({W3C})},
+  owner		= {egon},
+  timestamp	= {2008.08.17}
+}
+
+@WEBPAGE{	  wc,
+  title		= {The wc Command},
+  url		= {http://www.bellevuelinux.org/wc.html},
+  lastchecked	= {032009},
+  key		= {{@WC}},
+  owner		= {egon},
+  timestamp	= {2009.03.23}
+}
+
+@WEBPAGE{	  yahoo,
+  title		= {The {Yahoo!}~Internet search engine},
+  url		= {http://www.yahoo.com},
+  lastchecked	= {032009},
+  key		= {{@YAHOO}},
+  organization	= {Yahoo!},
+  owner		= {egon},
+  timestamp	= {2009.03.22}
+}
+@inproceedings{Steger08,
+  author = {Johannes Steger and Niklas Wilming and Felix Wolfsteller and Nicolas H\"oning and Peter K\"onig},
+  booktitle = {WAPCV},
+  editor = {Lucas Paletta and John K. Tsotsos},
+  interHash = {704e542c1aaef3839a566c64998ac686},
+  intraHash = {c0494621b8f35d0f701a7c8bcb6847fa},
+  pages = {153-165},
+  publisher = {Springer},
+  series = {Lecture Notes in Computer Science},
+  title = {The JAMF Attention Modelling Framework.},
+  url = {http://dblp.uni-trier.de/db/conf/wapcv/wapcv2008.html#StegerWWHK08},
+  volume = {5395},
+  year = {2008},
+  ee = {http://dx.doi.org/10.1007/978-3-642-00582-4_12},
+  isbn = {978-3-642-00581-7},
+  date = {2009-02-24}
+}
+
+@Manual{libsvm,
+  author ={Chih-Chung Chang and Chih-Jen Lin},
+  title = {{LIBSVM}: a library for support vector machines},
+  year = {2001},
+  note = {Software available at \url{http://www.csie.ntu.edu.tw/~cjlin/libsvm}}
+}
+
+@techreport{dom,
+  title = {Document Object Model (DOM) Level 3 Core Specification},
+  year = {2004},
+  author = {Arnaud Le Hors and Philippe Le H\'{e}garet and Lauren Wood and Gavin Nicol and Jonathan Robie and Mike Champion and Steve Byrne},
+  institution = {W3C},
+  type = {Recommendation}
+}
+@techreport{xul,
+  title = {XML User Interface Language (XUL) 1.0},
+  year = {2001},
+  author = {Ben Goodger and Ian Hickson and David Hyatt and Chris Waterson},
+  editor = {David Hyatt},
+  institution = {Mozilla.org},
+  type = {Recommendation}
+}
+@inproceedings{bootcat,
+author={M. Baroni and S. Bernardini},
+year={2004},
+title={BootCaT: Bootstrapping corpora and terms from the web},
+series = {Proceedings of LREC 2004}
+}
+@comment{
+@TECHREPORT{NajorkHeydon2001,
+  author = {Marc Najork and Allan Heydon},
+  title = {High-performance web crawling},
+  institution = {SRC Research Report 173, Compaq Systems Research},
+  year = {2001},
+  citeseerurl = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.7568},
+  url = {ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/SRC-173.ps.gz}
+}
+
+@INPROCEEDINGS{ShkapenyukSuel2002,
+  author = {Vladislav Shkapenyuk and Torsten Suel},
+  title = {Design and implementation of a high-performance distributed web crawler},
+  booktitle = {In Proc. of the Int. Conf. on Data Engineering},
+  year = {2002},
+  pages = {357--368},
+  citeseerurl = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.13.4762},
+  url = {http://cis.poly.edu/~suel/papers/crawl.ps}
+}
+}
+
+@inproceedings{StegerStemle2009,
+    abstract = {Algorithmic processing of Web content mostly works on textual contents, neglecting visual information. Annotation tools largely share this deficit as well. We specify requirements for an architecture to overcome both problems and propose an implementation, the KrdWrd system. It uses the Gecko rendering engine for both annotation and feature extraction, providing unified data access in every processing step. Stable data storage and collaboration control scripts for group annotations of massive corpora are provided via a Web interface coupled with a HTTP proxy. A modular interface allows for linguistic and visual data feature extractor plugins. The implementation is suitable for many tasks in theWeb as corpus domain and beyond.},
+    address = {Donostia-San Sebastian, Basque Country},
+    author = {Steger, Johannes M. and Stemle, Egon W.},
+    booktitle = {Proceedings of the Fifth Web as Corpus Workshop (WAC5)},
+    file = {:home/egon/Books/mndlyBooks/Steger, Stemle/Proceedings of the Fifth Web as Corpus Workshop (WAC5)/Steger, Stemle - 2009 - {KrdWrd} Architecture for Unified Processing of Web Content.pdf:pdf},
+    title = {{{KrdWrd} Architecture for Unified Processing of Web Content}},
+    year = {2009}
+}
+