Automatically assigned DDC number: 006312
Manually assigned DDC number: 006312
Number of references: 8
Title: Exploiting Structural Information for Text Classification on the WWW
Author:
Subject: Johannes Furnkranz Exploiting Structural Information for Text Classification on the WWW
Description: In this paper, we report on a set of experiments that explore the utility of making use of the structural information of WWW documents. Our working hypothesis is that it is often easier to classify a hypertext page using information provided on pages that point to it instead of using information that is provided on the page itself. We present experimental evidence that confirms this hypothesis on a set of Web-pages that relate to Computer Science Departments. 1 Introduction The advent of the World-Wide Web has rejuvenated the interest in text categorization problems. Vast amounts of documents are available on-line, and categorizing them into meaningful semantic categories is a rewarding and challenging research problem. However, current approaches to text categorization on the Web mostly concentrate on simple representation schemes that are based on word occurrence and word frequency. The structural information that is inherent to documents on the Web is often neglected. There are a...
Contributor: The Pennsylvania State University CiteSeer Archives
Publisher: unknown
Date: 1999-05-05
Pubyear: 1999
Format: ps
Identifier: http://citeseer.ist.psu.edu/173894.html
Source: http://www.ai.univie.ac.at/~juffi/publications/ida-99.ps.gz
Language: en
Relation:
Relation:
Relation:
Relation:
Relation:
Relation:
Relation:
Relation:
Rights: unrestricted
<?xml version="1.0" encoding="UTF-8"?>
<references_metadata>
<rec ID="/54411.html" Type="inproceedings" CiteSeer_Book="Proc of the 12th International Conference on Machine Learning" CiteSeer_Volume="" Title="Fast Effective Rule Induction,">
<identifier Org="ISBN:0387333339" Paper_ID="/54411.html" Extracted="0387333339" DDC="006.312" Normalized_DDC="006312" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:0387699341" Paper_ID="/54411.html" Extracted="0387699341" DDC="006.312" Normalized_DDC="006312" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:0898715458" Paper_ID="/54411.html" Extracted="0898715458" DDC="006.3/12" Normalized_DDC="006312" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:0898715938" Paper_ID="/54411.html" Extracted="0898715938" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:1558603778" Paper_ID="/54411.html" Extracted="1558603778" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:1591404509" Paper_ID="/54411.html" Extracted="1591404509" DDC="006.3/3" Normalized_DDC="00633" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:1605660108" Paper_ID="/54411.html" Extracted="1605660108" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540222189" Paper_ID="/54411.html" Extracted="3540222189" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540231056" Paper_ID="/54411.html" Extracted="3540231056" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540236627" Paper_ID="/54411.html" Extracted="3540236627" DDC="005.75/8" Normalized_DDC="005758" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540265430" Paper_ID="/54411.html" Extracted="3540265430" DDC="006.3/12" Normalized_DDC="006312" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540287957" Paper_ID="/54411.html" Extracted="3540287957" DDC="519.5" Normalized_DDC="5195" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540403000" Paper_ID="/54411.html" Extracted="3540403000" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540408134" Paper_ID="/54411.html" Extracted="3540408134" DDC="519.5" Normalized_DDC="5195" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540425381" Paper_ID="/54411.html" Extracted="3540425381" DDC="005.1/15" Normalized_DDC="005115" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540430601" Paper_ID="/54411.html" Extracted="3540430601" DDC="519.5" Normalized_DDC="5195" Normalized_Weight="0.058823529411764705" />
<identifier Org="ISBN:3540681248" Paper_ID="/54411.html" Extracted="3540681248" />
<identifier Org="ISBN:3540695729" Paper_ID="/54411.html" Extracted="3540695729" />
<identifier Org="ISBN:3540753893" Paper_ID="/54411.html" Extracted="3540753893" DDC="006.31" Normalized_DDC="00631" Normalized_Weight="0.058823529411764705" />
</rec>
<rec ID="/56507.html" Type="inproceedings" CiteSeer_Book="AAAIIAAI Vol 1" CiteSeer_Volume="" Title="Learning Trees and Rules with Set-Valued Features,">
<identifier Org="ISBN:0124438814" Paper_ID="/56507.html" Extracted="0124438814" DDC="006.3/3" Normalized_DDC="00633" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:0262510987" Paper_ID="/56507.html" Extracted="0262510987" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:0769510469" Paper_ID="/56507.html" Extracted="0769510469" />
<identifier Org="ISBN:1558607072" Paper_ID="/56507.html" Extracted="1558607072" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:1558608699" Paper_ID="/56507.html" Extracted="1558608699" />
<identifier Org="ISBN:1581137230" Paper_ID="/56507.html" Extracted="1581137230" />
<identifier Org="ISBN:3540263195" Paper_ID="/56507.html" Extracted="3540263195" DDC="006.32" Normalized_DDC="00632" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540281770" Paper_ID="/56507.html" Extracted="3540281770" DDC="005.1/15" Normalized_DDC="005115" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540292306" Paper_ID="/56507.html" Extracted="3540292306" DDC="501" Normalized_DDC="501" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540298495" Paper_ID="/56507.html" Extracted="3540298495" DDC="621.392" Normalized_DDC="621392" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540404333" Paper_ID="/56507.html" Extracted="3540404333" DDC="006.3/33" Normalized_DDC="006333" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540418261" Paper_ID="/56507.html" Extracted="3540418261" DDC="025.04" Normalized_DDC="02504" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540420274" Paper_ID="/56507.html" Extracted="3540420274" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540633464" Paper_ID="/56507.html" Extracted="3540633464" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540647384" Paper_ID="/56507.html" Extracted="3540647384" DDC="005.1/15" Normalized_DDC="005115" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540663320" Paper_ID="/56507.html" Extracted="3540663320" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540672273" Paper_ID="/56507.html" Extracted="3540672273" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540717021" Paper_ID="/56507.html" Extracted="3540717021" />
<identifier Org="ISBN:9042012579" Paper_ID="/56507.html" Extracted="9042012579" />
</rec>
<rec ID="/90349.html" Type="misc" CiteSeer_Book="" CiteSeer_Volume="" Title="Using statistical and relational methods to characterize hyperlink paths,">
<identifier Org="ISBN:3540663320" Paper_ID="/90349.html" Extracted="3540663320" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="1.0" />
</rec>
<rec ID="/124233.html" Type="inproceedings" CiteSeer_Book="Proceedings of AAAI98 15th Conference of the American Association for Artificial Intelligence" CiteSeer_Volume="" Title="Learning to extract symbolic knowledge from the {W}orld {W}ide {W}eb," />
<rec ID="/156523.html" Type="article" CiteSeer_Book="The AI Magazine" CiteSeer_Volume="18" Title="Machine-Learning Research: Four Current Directions,">
<identifier Org="ISBN:0262012111" Paper_ID="/156523.html" Extracted="0262012111" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.05" />
<identifier Org="ISBN:0470116625" Paper_ID="/156523.html" Extracted="0470116625" DDC="572.80285/61" Normalized_DDC="5728028561" Normalized_Weight="0.05" />
<identifier Org="ISBN:1402073887" Paper_ID="/156523.html" Extracted="1402073887" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.05" />
<identifier Org="ISBN:1577350030" Paper_ID="/156523.html" Extracted="1577350030" DDC="658.5" Normalized_DDC="6585" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540205896" Paper_ID="/156523.html" Extracted="3540205896" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540231056" Paper_ID="/156523.html" Extracted="3540231056" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540306765" Paper_ID="/156523.html" Extracted="3540306765" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540366679" Paper_ID="/156523.html" Extracted="3540366679" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540404082" Paper_ID="/156523.html" Extracted="3540404082" DDC="006.3/2" Normalized_DDC="00632" Normalized_Weight="0.05" />
<identifier Org="ISBN:354041066X" Paper_ID="/156523.html" Extracted="354041066X" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540425535" Paper_ID="/156523.html" Extracted="3540425535" DDC="658.4/038/0285574" Normalized_DDC="65840380285574" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540425551" Paper_ID="/156523.html" Extracted="3540425551" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540659072" Paper_ID="/156523.html" Extracted="3540659072" DDC="003/.3" Normalized_DDC="0033" Normalized_Weight="0.05" />
<identifier Org="ISBN:354065965X" Paper_ID="/156523.html" Extracted="354065965X" DDC="006.3/3" Normalized_DDC="00633" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540665587" Paper_ID="/156523.html" Extracted="3540665587" DDC="025/.00285" Normalized_DDC="02500285" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540665994" Paper_ID="/156523.html" Extracted="3540665994" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540679774" Paper_ID="/156523.html" Extracted="3540679774" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540679936" Paper_ID="/156523.html" Extracted="3540679936" DDC="510 s" Normalized_DDC="51" Normalized_Weight="0.05" />
<identifier Org="ISBN:3540728465" Paper_ID="/156523.html" Extracted="3540728465" DDC="006.4/2" Normalized_DDC="00642" Normalized_Weight="0.05" />
<identifier Org="ISBN:9051994753" Paper_ID="/156523.html" Extracted="9051994753" DDC="621.36/7" Normalized_DDC="621367" Normalized_Weight="0.05" />
</rec>
<rec ID="/174218.html" Type="Article" CiteSeer_Book="Machine Learning" CiteSeer_Volume="27" Title="Pruning Algorithms for Rule Learning,">
<identifier Org="ISBN:0769506259" Paper_ID="/174218.html" Extracted="0769506259" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:0898715687" Paper_ID="/174218.html" Extracted="0898715687" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:1402043775" Paper_ID="/174218.html" Extracted="1402043775" />
<identifier Org="ISBN:155860586X" Paper_ID="/174218.html" Extracted="155860586X" />
<identifier Org="ISBN:1558607781" Paper_ID="/174218.html" Extracted="1558607781" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:1586034529" Paper_ID="/174218.html" Extracted="1586034529" />
<identifier Org="ISBN:3540001700" Paper_ID="/174218.html" Extracted="3540001700" DDC="005.1" Normalized_DDC="0051" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540005676" Paper_ID="/174218.html" Extracted="3540005676" DDC="005.1/15" Normalized_DDC="005115" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540219374" Paper_ID="/174218.html" Extracted="3540219374" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540231056" Paper_ID="/174218.html" Extracted="3540231056" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540419101" Paper_ID="/174218.html" Extracted="3540419101" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540628584" Paper_ID="/174218.html" Extracted="3540628584" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540663320" Paper_ID="/174218.html" Extracted="3540663320" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:3540714405" Paper_ID="/174218.html" Extracted="3540714405" />
<identifier Org="ISBN:3540859276" Paper_ID="/174218.html" Extracted="3540859276" />
<identifier Org="ISBN:9810246846" Paper_ID="/174218.html" Extracted="9810246846" DDC="621.399" Normalized_DDC="621399" Normalized_Weight="0.08333333333333333" />
<identifier Org="ISBN:9810247532" Paper_ID="/174218.html" Extracted="9810247532" DDC="658.4/033" Normalized_DDC="6584033" Normalized_Weight="0.08333333333333333" />
</rec>
<rec ID="/153148.html" Type="misc" CiteSeer_Book="" CiteSeer_Volume="" Title="Using Links for Classifying Web-pages,">
<identifier Org="ISBN:1581137044" Paper_ID="/153148.html" Extracted="1581137044" DDC="006.7" Normalized_DDC="0067" Normalized_Weight="0.3333333333333333" />
<identifier Org="ISBN:3540408088" Paper_ID="/153148.html" Extracted="3540408088" DDC="381/.142/028558" Normalized_DDC="381142028558" Normalized_Weight="0.3333333333333333" />
<identifier Org="ISBN:3540663320" Paper_ID="/153148.html" Extracted="3540663320" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.3333333333333333" />
</rec>
<rec ID="/9707.html" Type="misc" CiteSeer_Book="" CiteSeer_Volume="" Title="A Case Study in Using Linguistic Phrases for Text Categorization on the {WWW},">
<identifier Org="ISBN:0387244352" Paper_ID="/9707.html" Extracted="0387244352" DDC="006.3/12" Normalized_DDC="006312" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:0769507107" Paper_ID="/9707.html" Extracted="0769507107" />
<identifier Org="ISBN:0792373499" Paper_ID="/9707.html" Extracted="0792373499" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:0792376560" Paper_ID="/9707.html" Extracted="0792376560" DDC="005.2/76" Normalized_DDC="005276" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:079237679X" Paper_ID="/9707.html" Extracted="079237679X" DDC="005" Normalized_DDC="005" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:0826491812" Paper_ID="/9707.html" Extracted="0826491812" DDC="418/.020285" Normalized_DDC="418020285" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:158113231X" Paper_ID="/9707.html" Extracted="158113231X" />
<identifier Org="ISBN:1600217001" Paper_ID="/9707.html" Extracted="1600217001" DDC="401/.410285" Normalized_DDC="401410285" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:1845640179" Paper_ID="/9707.html" Extracted="1845640179" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:3540211233" Paper_ID="/9707.html" Extracted="3540211233" DDC="006.3/2" Normalized_DDC="00632" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:3540213317" Paper_ID="/9707.html" Extracted="3540213317" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:3540213821" Paper_ID="/9707.html" Extracted="3540213821" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:3540425365" Paper_ID="/9707.html" Extracted="3540425365" DDC="006.3/1" Normalized_DDC="00631" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:3540663320" Paper_ID="/9707.html" Extracted="3540663320" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07692307692307693" />
<identifier Org="ISBN:3540673059" Paper_ID="/9707.html" Extracted="3540673059" DDC="006.3/2" Normalized_DDC="00632" Normalized_Weight="0.07692307692307693" />
</rec>
<rec ID="SELF" Type="SELF" CiteSeer_Book="SELF" CiteSeer_Volume="SELF" Title="Exploiting Structural Information for Text Classification on the WWW">
<identifier Org="ISBN:0521836573" Paper_ID="SELF" Extracted="0521836573" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.1" />
<identifier Org="ISBN:076374137X" Paper_ID="SELF" Extracted="076374137X" DDC="006.3/3" Normalized_DDC="00633" Normalized_Weight="0.1" />
<identifier Org="ISBN:1402037678" Paper_ID="SELF" Extracted="1402037678" DDC="025.04" Normalized_DDC="02504" Normalized_Weight="0.1" />
<identifier Org="ISBN:1402081502" Paper_ID="SELF" Extracted="1402081502" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.1" />
<identifier Org="ISBN:1581135939" Paper_ID="SELF" Extracted="1581135939" />
<identifier Org="ISBN:1581137230" Paper_ID="SELF" Extracted="1581137230" />
<identifier Org="ISBN:1595931406" Paper_ID="SELF" Extracted="1595931406" />
<identifier Org="ISBN:3540201777" Paper_ID="SELF" Extracted="3540201777" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.1" />
<identifier Org="ISBN:3540250573" Paper_ID="SELF" Extracted="3540250573" DDC="006.312" Normalized_DDC="006312" Normalized_Weight="0.1" />
<identifier Org="ISBN:3540315888" Paper_ID="SELF" Extracted="3540315888" DDC="006.7" Normalized_DDC="0067" Normalized_Weight="0.1" />
<identifier Org="ISBN:354040726X" Paper_ID="SELF" Extracted="354040726X" DDC="025/.00285" Normalized_DDC="02500285" Normalized_Weight="0.1" />
<identifier Org="ISBN:3540663320" Paper_ID="SELF" Extracted="3540663320" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.1" />
<identifier Org="ISBN:3540691367" Paper_ID="SELF" Extracted="3540691367" DDC="006.33" Normalized_DDC="00633" Normalized_Weight="0.1" />
<identifier Org="ISBN:3540733442" Paper_ID="SELF" Extracted="3540733442" />
</rec>
</references_metadata>