/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * <BR><BR>
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * <BR><BR>
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * <BR><BR>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * <BR><BR>
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * <BR><BR>
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.util;


/** This class functions much like a <strong>StringTokenizer</strong> in that it splits a long string into tokens, however this tokenizer also recognises HTML formatting tags.
 * <BR><BR>
 * Two kinds of token are produced:
 * <ul>
 * <li>a <i>tag</i> token, which starts at a '&lt;' and runs up to and including the next '&gt;' (or to the end of the source if the tag is never closed), with any spaces inside it preserved;</li>
 * <li>a <i>text</i> token, which is a run of characters ended by a space, by a '&lt;' or by the end of the source.</li>
 * </ul>
 * Only the space character (' ') is treated as a separator; tabs and line breaks are kept as part of a text token.
 * The tokenizer always parses one token ahead, so that hasMoreTokens can answer without side effects.
 */
public class HTMLStringTokenizer {
    /** The current position in the source string, i.e. the index of the first character not yet consumed. */
    private int pos = 0;
    /** The look-ahead token, i.e. the one the next call to nextToken will return. Empty once the source is exhausted. */
    private String current = null;
    /** The token most recently returned by nextToken, or <i>null</i> if nextToken has not been called yet. */
    private String previous = null;
    /** The string to be tokenized, including any HTML markup. Never <i>null</i> after construction. */
    private String source = null;

    /** Constructor.
     * @param source The source <strong>String</strong> to be tokenized. A <i>null</i> source is treated as an empty string, so the tokenizer simply reports no tokens.
     */
    public HTMLStringTokenizer(String source) {
        this.source = (source == null) ? "" : source;
        // Parse the first token so that hasMoreTokens works straight away.
        parseToken();
    }

    /** Determines if there are still tokens remaining unparsed in the source.
     * @return A <strong>boolean</strong> which is <i>true</i> if there are more tokens.
     */
    public boolean hasMoreTokens() {
        return current != null && current.length() > 0;
    }

    /** Determines if the token most recently returned by nextToken is a tag, that is, it starts with '&lt;' and ends with '&gt;'.
     * @return A <strong>boolean</strong> indicating if the token is a tag. Always <i>false</i> if nextToken has not been called yet.
     */
    public boolean isTag() {
        // previous stays null until the first nextToken call; guard against it rather than throwing.
        return previous != null && previous.startsWith("<") && previous.endsWith(">");
    }

    /** Retrieves the next token and advances the tokenizer.
     * @return A <strong>String</strong> representing the token, or an empty string if there are no tokens left.
     */
    public String nextToken() {
        previous = current;
        // Read ahead so that hasMoreTokens reflects the state after this call.
        parseToken();
        return previous;
    }

    /** Parses the next token and stores it in current. If the source is exhausted current becomes the empty string.
     */
    private void parseToken() {
        dumpWhiteSpace();
        final int length = source.length();
        if (pos >= length) {
            current = "";
            return;
        }
        final int start = pos;
        int end;
        if (source.charAt(start) == '<') {
            // Reading a tag. Watch only for '>', which is included in the token.
            // An unterminated tag swallows the remainder of the source.
            final int close = source.indexOf('>', start + 1);
            end = (close < 0) ? length : close + 1;
        }
        else {
            // Reading text. The first character always belongs to the token (it is
            // neither ' ' nor '<' at this point); after that stop at ' ' or '<'.
            end = start + 1;
            while (end < length) {
                final char c = source.charAt(end);
                if (c == ' ' || c == '<') {
                    break;
                }
                end++;
            }
        }
        current = source.substring(start, end);
        // A terminating '<' must be left for the next token. A terminating ' ' may be
        // left too, since dumpWhiteSpace skips it at the start of the next parse.
        pos = end;
    }

    /** Method to ignore whitespace in the source. Only the space character is skipped.
     */
    private void dumpWhiteSpace() {
        while (pos < source.length() && source.charAt(pos) == ' ') {
            pos++;
        }
    }
}
