This repository has been archived by the owner on Jul 16, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
egon.bib
134 lines (133 loc) · 10.6 KB
/
egon.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
% Edited volume of working papers on the Web as Corpus (editors only, no author).
% NOTE(review): publisher/address are not in the original export; confirm against the book's imprint.
@book{Wacky2006,
  editor    = {Baroni, Marco and Bernardini, Silvia},
  title     = {Wacky! {Working} papers on the {Web} as {Corpus}},
  publisher = {GEDIT},
  address   = {Bologna},
  year      = {2006},
  isbn      = {88-6027-004-9},
  url       = {http://wackybook.sslmit.unibo.it/}
}
% LREC 2004 conference paper on BootCaT.
% The corporate editor is double-braced so BibTeX treats it as one indivisible name;
% the url uses a literal tilde (the url package typesets it correctly).
% The file field is a local attachment path and is kept verbatim.
@inproceedings{BaroniSilvia2004,
  author    = {Baroni, Marco and Bernardini, Silvia},
  title     = {{BootCaT}: Bootstrapping corpora and terms from the web},
  booktitle = {Proceedings of {LREC} 2004},
  editor    = {{European Language Resources Association (ELRA)}},
  publisher = {ELDA},
  address   = {Lisbon},
  pages     = {1313--1316},
  year      = {2004},
  type      = {Conference proceedings (article)},
  url       = {http://sslmit.unibo.it/~baroni/publications/lrec2004/bootcat_lrec_2004.pdf},
  file      = {:home/egon/Books/mndlyBooks/Baroni, Bernardini/Proceedings of LREC 2004, Lisbon ELDA./Baroni, Bernardini - 2004 - {BootCaT} Bootstrapping corpora and terms from the web.pdf:pdf}
}
% Journal article describing the ukWaC, deWaC and itWaC web-crawled corpora.
@article{wackycorpora2009,
  author    = {Baroni, Marco and Bernardini, Silvia and Ferraresi, Adriano and Zanchetta, Eros},
  title     = {{The WaCky wide web: a collection of very large linguistically processed web-crawled corpora}},
  journal   = {Language Resources and Evaluation},
  volume    = {43},
  number    = {3},
  pages     = {209--226},
  month     = feb,
  year      = {2009},
  publisher = {Springer Netherlands},
  issn      = {1574-020X},
  doi       = {10.1007/s10579-009-9081-4},
  url       = {http://www.springerlink.com/content/c348pu7321gx5081/},
  keywords  = {Humanities,Social Sciences and Law},
  abstract  = {This article introduces ukWaC, deWaC and itWaC, three very large corpora of English, German, and Italian built by web crawling, and describes the methodology and tools used in their construction. The corpora contain more than a billion words each, and are thus among the largest resources for the respective languages. The paper also provides an evaluation of their suitability for linguistic research, focusing on ukWaC and itWaC. A comparison in terms of lexical coverage with existing resources for the languages of interest produces encouraging results. Qualitative evaluation of ukWaC versus the British National Corpus was also conducted, so as to highlight differences in corpus composition (text types and subject matters). The article concludes with practical information about format and availability of corpora and tools.},
  file      = {:home/egon/Books/mndlyBooks/Baroni et al./Language Resources and Evaluation/Baroni et al. - 2009 - The WaCky wide web a collection of very large linguistically processed web-crawled corpora.pdf:pdf}
}
% COMPLEX '94 paper on a corpus query system architecture; also available as an arXiv e-print.
% The arXiv identifier is stored in the standard eprint/archiveprefix fields so styles can print it.
@inproceedings{Christ1994,
  author        = {Christ, Oliver},
  title         = {A Modular and Flexible Architecture for an Integrated Corpus Query System},
  booktitle     = {Papers in Computational Lexicography ({COMPLEX} '94)},
  pages         = {22--32},
  month         = aug,
  year          = {1994},
  eprint        = {cmp-lg/9408005},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/cmp-lg/9408005},
  keywords      = {Computation and Language},
  abstract      = {The paper describes the architecture of an integrated and extensible corpus query system developed at the University of Stuttgart and gives examples of some of the modules realized within this architecture. The modules form the core of a corpus workbench. Within the proposed architecture, information required for the evaluation of queries may be derived from different knowledge sources (the corpus text, databases, on-line thesauri) and by different means: either through direct lookup in a database or by calling external tools which may infer the necessary information at the time of query evaluation. The information available and the method of information access can be stated declaratively and individually for each corpus, leading to a flexible, extensible and modular corpus workbench.},
  file          = {:home/egon/Books/mndlyBooks/Christ/Papers in Computational Lexicography (COMPLEX '94)/Christ - 1994 - A Modular and Flexible Architecture for an Integrated Corpus Query System.pdf:pdf}
}
% Journal article on JpWaC, a Japanese web corpus loaded into the Sketch Engine.
% Accented letters use the brace-wrapped form so BibTeX sorts and labels them as single letters.
% NOTE(review): the citation key says 2009 but the year field is 2008; the key is kept so existing citations still resolve.
@article{JpWaC2009,
  author   = {Erjavec, Irena Srdanovi{\'c} and Erjavec, Toma{\v{z}} and Kilgarriff, Adam},
  title    = {A web corpus and word sketches for {Japanese}},
  journal  = {Information and Media Technologies},
  volume   = {3},
  pages    = {529--551},
  year     = {2008},
  url      = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.98.5049},
  abstract = {Of all the major world languages, Japanese is lagging behind in terms of publicly accessible and searchable corpora. In this paper we describe the development of JpWaC, a large corpus of 400 million words of Japanese web text, and its encoding for the Sketch Engine. The Sketch Engine is a web-based corpus query tool that supports fast concordancing, grammatical processing, `word sketching' (one-page summaries of a word's grammatical and collocational behaviour), a distributional thesaurus, and robot use. We describe the steps taken to gather and process the corpus, and the development of a shallow grammar for Japanese to enable word sketching. We believe that the Japanese web corpus as loaded into the Sketch Engine will be a useful resource for a wide number of Japanese researchers, learners, and NLP developers.},
  file     = {:home/egon/Books/mndlyBooks/Erjavec, Erjavec, Kilgarriff/Information and Media Technologies/Erjavec, Erjavec, Kilgarriff - 2008 - A web corpus and word sketches for Japanese.pdf:pdf}
}
% Workshop paper on content classification for digital libraries.
% NOTE(review): the original export had no booktitle (a required field for this entry type);
% the venue below is the one this paper is commonly cited with - verify against the paper itself.
@inproceedings{FinnKushmerickSmyth2001,
  author    = {Finn, A. and Kushmerick, N. and Smyth, B.},
  title     = {Fact or fiction: Content classification for digital libraries},
  booktitle = {Joint {DELOS-NSF} Workshop on Personalisation and Recommender Systems in Digital Libraries},
  year      = {2001},
  type      = {Conference proceedings (article)},
  file      = {:home/egon/Books/mndlyBooks/Finn, Kushmerick, Smyth/Unknown/Finn, Kushmerick, Smyth - 2001 - Fact or fiction Content classification for digital libraries.pdf:pdf}
}
% RIAO conference paper estimating per-language text volume on the Web.
% The venue name lives in booktitle; the original export had it in publisher, prefixed with "In" and cut off mid-parenthesis.
% The abstract ends where the paper's abstract ends; a truncated fragment of the introduction was dropped.
@inproceedings{Grefenstette2000,
  author    = {Grefenstette, Gregory and Nioche, Julien},
  title     = {Estimation of {English} and {non-English} Language Use on the {WWW}},
  booktitle = {Recherche d'Information Assist{\'e}e par Ordinateur ({RIAO})},
  pages     = {237--246},
  year      = {2000},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.2.2996},
  abstract  = {The World Wide Web has grown so big, in such an anarchic fashion, that it is difficult to describe. One of the evident intrinsic characteristics of the World Wide Web is its multilinguality. Here, we present a technique for estimating the size of a language-specific corpus given the frequency of commonly occurring words in the corpus. We apply this technique to estimating the number of words available through Web browsers for given languages. Comparing data from 1996 to data from 1999 and 2000, we calculate the growth of a number of European languages on the Web. As expected, non-English languages are growing at a faster pace than English, though the position of English is still dominant.},
  file      = {:home/egon/Books/mndlyBooks/Grefenstette, Nioche/Unknown/Grefenstette, Nioche - 2000 - Estimation of English and non-English Language Use on the WWW.pdf:pdf}
}
% WAC6 workshop paper presenting NoWaC, a web-based corpus for Norwegian.
@inproceedings{NoWaC2010,
  author    = {Guevara, Emiliano},
  title     = {{NoWaC: a large web-based corpus for Norwegian}},
  booktitle = {Proceedings of the Sixth Web as Corpus Workshop (WAC6)},
  publisher = {The Association for Computational Linguistics},
  pages     = {1--7},
  month     = jun,
  year      = {2010},
  url       = {http://portal.acm.org/citation.cfm?id=1868765.1868766},
  file      = {:home/egon/Books/mndlyBooks/Guevara/Proceedings of the Sixth Web as Corpus Workshop (WAC6)/Guevara - 2010 - NoWaC a large web-based corpus for Norwegian.pdf:pdf}
}
% ACL 2000 Student Research Workshop paper on building a minority-language corpus from the Web.
% The proceedings name lives in booktitle (the original export had it in publisher on a misc entry).
% The percent sign in the abstract is escaped so the field survives being typeset or passed through Biber.
% The abstract is truncated at the source (ends in "c..."); left as exported.
@inproceedings{Jones2000,
  author    = {Jones, Rosie and Ghani, Rayid},
  title     = {Automatically Building a Corpus for a Minority Language from the {Web}},
  booktitle = {Proceedings of the Student Research Workshop at the 38th Annual Meeting of the Association for Computational Linguistics},
  pages     = {29--36},
  year      = {2000},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.3825},
  abstract  = {We present an approach to language-specific query-based sampling which, given a single document in a target language, can find many more examples of documents in that language, by automatically constructing queries to access such documents on the world wide web. We propose a number of methods for building search queries to quickly obtain documents in the target language. They perform accurately and efficiently for building a corpus of documents in Tagalog starting from a single seed document, when these documents are only 2.5\% of the documents in a collection. We found that sampling with a query consisting of a word selected according to its probability from the minority language corpus constructed so far was very successful. This method built a corpus of documents with word frequencies similar to those in the corpus based on all Tagalog documents in our collection, and required a relatively small number of search queries. It also quickly acquired a good c...},
  file      = {:home/egon/Books/mndlyBooks/Jones, Ghani/Unknown/Jones, Ghani - 2000 - Automatically Building a Corpus for a Minority Language from the Web.pdf:pdf}
}
% Journal article on building the New Corpus for Ireland (NCI).
% The two-word surname needs no extra braces in "Last, First" form; the accented i is written
% as a top-level brace group so BibTeX treats it as a single letter for sorting and labels.
% The file field is a local attachment path and is kept verbatim.
@article{Kilgarriff2006,
  author    = {Kilgarriff, Adam and Rundell, Michael and U{\'\i} Dhonnchadha, Elaine},
  title     = {Efficient corpus development for lexicography: building the {New Corpus for Ireland}},
  journal   = {Language Resources and Evaluation},
  volume    = {40},
  number    = {2},
  pages     = {127--152},
  month     = dec,
  year      = {2006},
  publisher = {Springer Netherlands},
  issn      = {1574-020X},
  doi       = {10.1007/s10579-006-9011-7},
  url       = {http://www.springerlink.com/content/9737k33158jtw084/},
  keywords  = {Humanities,Social Sciences and Law},
  abstract  = {In a 12-month project we have developed a new, register-diverse, 55-million-word bilingual corpus---the New Corpus for Ireland (NCI)---to support the creation of a new English-to-Irish dictionary. The paper describes the strategies we employed, and the solutions to problems encountered. We believe we have a good model for corpus creation for lexicography, and others may find it useful as a blueprint. The corpus has two parts, one Irish, the other Hiberno-English (English as spoken in Ireland). We describe its design, collection and encoding.},
  annote    = {2 Refs:
GrefenstetteN.2000
JonesG.2000
- on copyrighted material when using web as corpus
- what's in a web corpus},
  file      = {:home/egon/Books/mndlyBooks/Kilgarriff, Rundell, U\'{\i} Dhonnchadha/Language Resources and Evaluation/Kilgarriff, Rundell, U\'{\i} Dhonnchadha - 2006 - Efficient corpus development for lexicography building the New Corpus for Ireland.pdf:pdf}
}
% Web reference for the IMS Open Corpus Workbench (CWB) project site.
% NOTE(review): the author value looks like an export placeholder; confirm the intended author.
@misc{cwb,
  author = {Web},
  title  = {{The {IMS} Open {C}orpus {W}orkbench ({CWB})}},
  year   = {2008},
  type   = {Electronic citation},
  url    = {http://cwb.sourceforge.net/}
}
% Web reference for the Yahoo! search engine.
% The corporate author is double-braced so BibTeX keeps it as one indivisible name
% instead of parsing it into first/last parts.
@misc{yahoo,
  author = {{Yahoo! Inc.}},
  title  = {The {Yahoo!} {Internet} search engine},
  year   = {1995},
  type   = {Electronic citation},
  url    = {http://www.yahoo.com}
}