% Poster paper in the WWW 2005 special interest tracks and posters proceedings.
% The doi field holds the bare DOI (no resolver prefix); styles add the resolver.
% NOTE(review): the url field points at www2006.org although this entry is for
% WWW 2005 -- confirm it refers to this paper and not a later one.
@InProceedings{KruplHG2005:Using,
  author    = {Kr{\"u}pl, Bernhard and Herzog, Marcus and Gatterbauer, Wolfgang},
  title     = {Using Visual Cues for Extraction of Tabular Data from Arbitrary {HTML} Documents},
  year      = {2005},
  month     = may # {~10--14,},
  booktitle = {Proceedings of special interest tracks and posters of the 14th international conference on {W}orld {W}ide {W}eb ({WWW}~2005)},
  pages     = {1000--1001},
  location  = {Chiba, Japan},
  publisher = {ACM Press},
  isbn      = {1-59593-051-5},
  doi       = {10.1145/1062745.1062838},
  url       = {http://www2006.org/programme/files/xhtml/p154/pp154-kruepl.html},
  pdf       = {http://www2005.org/cdrom/docs/p1000.pdf},
  pdf2      = {http://www.dbai.tuwien.ac.at/staff/gatter/work/WWW_2005_Visual_Information_Extraction.pdf},
  abstract  = {We describe a method to extract tabular data from web pages. Rather than just analyzing the DOM tree, we also exploit visual cues in the rendered version of the document to extract data from tables which are not explicitly marked with an HTML element. To detect tables, we rely on a variant of the well-known X-Y cut algorithm as used in the OCR community. We implemented the system by directly accessing Mozilla's box model that contains the positional data for all HTML elements of a given web page.},
  keyword   = {table detection, visual analysis, web information extraction},
}