% Conference paper from SOFSEM 2007 (Volume II of the proceedings).
% Note on `month`: the conference days "20--26," are concatenated onto the
% `jan` macro with `#`; the trailing comma is part of the literal value, so it
% presumably targets a style that prints month immediately before year
% (verify against the style in use before changing it).
% `pdf`, `keyword` and `location` are not standard BibTeX fields; they are kept
% under their original names in case external tooling reads them.
@InProceedings{PollakG2007:Creating,
  author    = {Pollak, Bernhard and Gatterbauer, Wolfgang},
  title     = {Creating Permanent Test Collections of Web Pages for Information Extraction Research},
  booktitle = {Proceedings of the 33rd International Conference on Current Trends in Theory and Practice of Computer Science ({SOFSEM} 2007)},
  volume    = {II},
  pages     = {103--115},
  year      = {2007},
  month     = jan # {~20--26,},
  location  = {Prague, Czech Republic},
  publisher = {ICS AS CR (Institute of Computer Science, Academy of Sciences of the Czech Republic)},
  isbn      = {80-903298-9-6},
  pdf       = {http://www.dbai.tuwien.ac.at/staff/gatter/work/SOFSEM_2007_Creating_Information_Extraction_Test_Collections.pdf},
  abstract  = {In the research area of automatic web information extraction, there is a need for permanent and annotated web page collections enabling objective performance evaluation of different algorithms. Currently, researchers are suffering from the absence of such representative and contemporary test collections, especially on web tables. At the same time, creating your own sharable web page collections is not trivial nowadays because of the dynamic and diverse nature of modern web technologies employed to create often short-lived online content. In this paper, we cover the problem of creating static representations of web pages in order to build sharable ground truth test sets. We explain the principal difficulties of the problem, discuss possible approaches and introduce our solution: WebPageDump, a Firefox extension capable of saving web pages exactly as they are rendered online. Finally, we benchmark our system with current alternatives using an innovative automatic method based on image snapshots.},
  keyword   = {saving web pages, web information extraction, test data, Firefox, web table ground truth, performance evaluation},
}