% AAAI 2006 conference paper on extracting tables from web pages.
% `month` holds only the month macro; the conference days (16--20 July 2006)
% are recorded as an ISO date range in `eventdate`.
@InProceedings{GatterbauerB2006:TableExtraction,
  author    = {Gatterbauer, Wolfgang and Bohunsky, Paul},
  title     = {Table Extraction Using Spatial Reasoning on the {CSS2} Visual Box Model},
  booktitle = {Proceedings of the 21st National Conference on Artificial Intelligence ({AAAI}~2006)},
  year      = {2006},
  month     = jul,
  eventdate = {2006-07-16/2006-07-20},
  pages     = {1313--1318},
  location  = {Boston, MA, USA},
  publisher = {AAAI, MIT Press},
  pdf       = {http://www.dbai.tuwien.ac.at/staff/gatter/work/AAAI_2006_Table_Extraction_Spatial_Reasoning.pdf},
  abstract  = {Tables on web pages contain a huge amount of semantically explicit information, which makes them a worthwhile target for automatic information extraction and knowledge acquisition from the Web. However, the task of table extraction from web pages is difficult, because of HTML's design purpose to convey visual instead of semantic information. In this paper, we propose a robust technique for table extraction from arbitrary web pages. This technique relies upon the positional information of visualized DOM element nodes in a browser and, hereby, separates the intricacies of code implementation from the actual intended visual appearance. The novel aspect of the proposed web table extraction technique is the effective use of spatial reasoning on the CSS2 visual box model, which shows a high level of robustness even without any form of learning (F-measure = 90\%). We describe the ideas behind our approach, the tabular pattern recognition algorithm operating on a double topographical grid structure and allowing for effective and robust extraction, and general observations on web tables that should be borne in mind by any automatic web table extraction mechanism.},
  keyword   = {table extraction, table recognition, table understanding, spatial reasoning, knowledge acquisition from the Web, information extraction on the Web},
}