% WWW 2007 conference paper. The month field holds only the standard macro so
% that every style (and Biber) can parse it; the conference days (8--12 May)
% are recorded separately in eventdate. Empty identifier placeholders were
% dropped; add isbn/doi back once verified values are available.
@InProceedings{ GatterbauerBHKP2007:Towards,
  author =      {Gatterbauer, Wolfgang and Bohunsky, Paul and Herzog, Marcus and Kr{\"u}pl, Bernhard and Pollak, Bernhard},
  title =       {Towards Domain-Independent Information Extraction from Web Tables},
  year =        {2007},
  month =       may,
  eventdate =   {2007-05-08/2007-05-12},
  booktitle =   {Proceedings of the 16th International {World Wide Web} Conference ({WWW}~2007)},
  pages =       {71--80},
  location =    {Banff, Alberta, Canada},
  publisher =   {ACM Press},
  url =         {http://www2007.org/paper790.php},
  pdf =         {http://www2007.org/papers/paper790.pdf},
  abstract =    {Traditionally, information extraction from web tables has
                 focused on small, more or less homogeneous corpora, often
                 based on assumptions about the use of \texttt{<table>} tags. A
                 multitude of different HTML implementations of web tables
                 make these approaches difficult to scale. In this paper, we
                 approach the problem of domain-independent information
                 extraction from web tables by shifting our attention from the
                 tree-based representation of web pages to a variation of the
                 two-dimensional visual box model used by web browsers to
                 display the information on the screen. The thereby obtained
                 topological and style information allows us to fill the gap
                 created by missing domain-specific knowledge about content
                 and table templates. We believe that, in a future step, this
                 approach can become the basis for a new way of large-scale
                 knowledge acquisition from the current ``Visual Web.''},
  keyword =     {Information extraction, Web mining, Web tables, Web page representation, Visual analysis},
}