Automatically assigned DDC number: 006312
Manually assigned DDC number: 006312
Number of references: 0
Title: Efficient Text Categorization
Author:
Subject: Marko Grobelnik Efficient Text Categorization
Description: We present an approach to text categorization using machine learning techniques. The approach is developed and tested on a large text hierarchy named Yahoo that is available on the Web. We handle the large number of features and training examples by taking into account the hierarchical structure of the examples and using feature subset selection for large text data. The large number of categories is handled separately for each testing example by pruning unpromising categories. In this way, the number of categories to be considered is cut to less than half without degrading the system performance. Our experiments are performed using a naive Bayesian classifier on text data, with a feature-vector document representation that includes n-grams instead of just single words (unigrams). Experimental evaluation on three domains constructed from the Yahoo hierarchy shows that, among several hundred categories, the correct category is assigned a probability over 0.99 when a rather small number of features is used. 1 Int...
Contributor: The Pennsylvania State University CiteSeer Archives
Publisher: unknown
Date: 1998-03-04
Pubyear: 1998
Format: ps
Identifier: http://citeseer.ist.psu.edu/140838.html
Source: http://www.cs.cmu.edu/~TextLearning/pww/papers/PWW/pwwWshECML98.ps.gz
Language: en
Rights: unrestricted
<?xml version="1.0" encoding="UTF-8"?>
<references_metadata>
<rec ID="SELF" Type="SELF" CiteSeer_Book="SELF" CiteSeer_Volume="SELF" Title="Efficient Text Categorization">
<identifier Org="ISBN:1402040423" Paper_ID="SELF" Extracted="1402040423" DDC="620.8/2" Normalized_DDC="62082" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:1581131461" Paper_ID="SELF" Extracted="1581131461" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:1581137230" Paper_ID="SELF" Extracted="1581137230" />
<identifier Org="ISBN:1586037749" Paper_ID="SELF" Extracted="1586037749" DDC="610.285" Normalized_DDC="610285" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:1591400511" Paper_ID="SELF" Extracted="1591400511" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:1853128066" Paper_ID="SELF" Extracted="1853128066" DDC="006.3/12" Normalized_DDC="006312" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540201807" Paper_ID="SELF" Extracted="3540201807" DDC="004.67/8" Normalized_DDC="004678" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540209719" Paper_ID="SELF" Extracted="3540209719" DDC="025.04" Normalized_DDC="02504" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540232583" Paper_ID="SELF" Extracted="3540232583" DDC="005.74" Normalized_DDC="00574" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540283129" Paper_ID="SELF" Extracted="3540283129" DDC="006.33" Normalized_DDC="00633" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540334726" Paper_ID="SELF" Extracted="3540334726" DDC="025.04" Normalized_DDC="02504" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540408134" Paper_ID="SELF" Extracted="3540408134" DDC="519.5" Normalized_DDC="5195" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:354041066X" Paper_ID="SELF" Extracted="354041066X" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540437819" Paper_ID="SELF" Extracted="3540437819" DDC="670/.285/63" Normalized_DDC="67028563" Normalized_Weight="0.07142857142857142" />
<identifier Org="ISBN:3540440380" Paper_ID="SELF" Extracted="3540440380" DDC="006.3" Normalized_DDC="0063" Normalized_Weight="0.07142857142857142" />
</rec>
</references_metadata>