/*
$Id: $
**
** swish2xml.c - dump a binary swish-e database to XML 
**               mostly compatible with jsFind
** 
** build from within swish-e/src dir
** gcc -o swish2xml -I. -Ireplace -I /usr/include/libxml2/ -L.libs -lswish-e -lxml2 swish2xml.c
** ./swish2xml > swish.xml
**  
** This program and library is free software; you can redistribute it and/or
** modify it under the terms of the GNU (Library) General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU (Library) General Public License for more details.
**
** You should have received a copy of the GNU (Library) General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
**---------------------------------------------------------
*/

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "swish.h"
#include "mem.h"
#include "search.h"
#include "merge.h"
#include "docprop.h"
#include "metanames.h"
#include "db.h"
#include <libxml/parser.h>
#include <libxml/tree.h>

/* Dump the index described by indexf as an XML document on standard
** output; the definition follows main() in this file. */
void DB_xml_dump (SWISH * sw, IndexFILE * indexf);

int
main (int argc, char **argv)
{
	SWISH *sw;

	setlocale (LC_CTYPE, "");
    
	SwishErrorsToStderr ();
    
    /* new SWISH handle */
	sw = SwishNew ();

    /* TODO: properly parse command line arguments */
	addindexfile (sw, argv[1] && *(argv[1]) ? argv[1] : INDEXFILE);

    /* Set the default index file */
	if (sw->indexlist == NULL)
		addindexfile (sw, INDEXFILE);

    /* process each index file */
	while (sw->indexlist != NULL)
	{
		DB_xml_dump (sw, sw->indexlist);
		putchar ('\n');

		sw->indexlist = sw->indexlist->next;
	}

    /* close SWISH handle */
	SwishClose (sw);
	exit (0);
}


#ifdef LIBXML_TREE_ENABLED
/*
** Append one <path> element to a <word> element.
**
** indexf        index the file number belongs to
** docpath_meta  property entry used to fetch the document path
** word_node     <word> element that receives the new child
** filenum       file number of the document
** frequency     value written to the "freq" attribute
**
** The document path becomes the element text.  A "title" attribute is
** added only when the title property exists and is a string.
*/
static void
xml_add_path_node (IndexFILE * indexf, struct metaEntry *docpath_meta,
		   xmlNodePtr word_node, int filenum, int frequency)
{
	RESULT r;
	DB_RESULTS db_results;
	char *docpath;
	PropValue *title;
	xmlNodePtr path_node;
	char freqbuf[32];

	/* Zeroed result records carrying only the index and file number */
	memset (&r, 0, sizeof (RESULT));
	memset (&db_results, 0, sizeof (DB_RESULTS));
	db_results.indexf = indexf;

	r.db_results = &db_results;
	r.filenum = filenum;
	r.fi.filenum = filenum;

	docpath = getResultPropAsString (&r, docpath_meta->metaID);
	title = getResultPropValue (&r, AUTOPROPERTY_TITLE, 0);

	/*
	 * xmlNewTextChild() escapes the content; xmlNewChild() would parse
	 * it for entity references and mangle any path containing '&'.
	 */
	path_node = xmlNewTextChild (word_node, NULL, BAD_CAST "path",
				     BAD_CAST docpath);
	if (path_node != NULL)
	{
		snprintf (freqbuf, sizeof freqbuf, "%d", frequency);
		xmlNewProp (path_node, BAD_CAST "freq", BAD_CAST freqbuf);

		/* The title lookup may yield nothing; only strings are emitted */
		if (title != NULL && PROP_STRING == title->datatype)
			xmlNewProp (path_node, BAD_CAST "title",
				    BAD_CAST title->value.v_str);
	}

	/* free results */
	if (title != NULL)
		freeResultPropValue (title);
	if (docpath != NULL)
		efree (docpath);
}
#endif

/*
** Dump the words of one index database as an XML document on stdout.
**
** sw      SWISH handle; the function aborts through SwishAbortLastError()
**         when opening the database sets sw->lasterror
** indexf  index to dump; indexf->DB and indexf->header are filled in here
**
** Output layout: a <swish-e> root holding one <word> per index word, each
** with a <name> child and one <path freq=".." [title=".."]> child per
** posting.  When libxml2 was built without tree support only a notice is
** printed.
*/
void
DB_xml_dump (SWISH * sw, IndexFILE * indexf)
{
#ifdef LIBXML_TREE_ENABLED
	int j, tmpval, filenum;
	int frequency = 0;
	int metadata_length = 0;
	unsigned int *posdata;
	char word[2];
	char *resultword;
	unsigned char *worddata, *s, *start, flag;
	int sz_worddata, saved_bytes;
	sw_off_t wordID;
	struct metaEntry *docpath_meta;
	xmlDocPtr doc;			/* document pointer */
	xmlNodePtr swishe_node, word_node;	/* node pointers */

	LIBXML_TEST_VERSION;

	/* Open Database */
	indexf->DB = DB_Open (sw, indexf->line, DB_READ);
	if (sw->lasterror)
		SwishAbortLastError (sw);

	/* Read header */
	read_header (sw, &indexf->header, indexf->DB);

	/* The DOCPATH property entry is the same for every posting */
	docpath_meta = getPropNameByName (&indexf->header,
					  AUTOPROPERTY_DOCPATH);

	/* Create a new XML document and root node */
	doc = xmlNewDoc (BAD_CAST "1.0");
	swishe_node = xmlNewNode (NULL, BAD_CAST "swish-e");
	xmlDocSetRootElement (doc, swishe_node);

	/* Declare DTD */
	xmlCreateIntSubset (doc, BAD_CAST "swish-e", NULL,
			    BAD_CAST "swish-e.dtd");

	/*
	 * Scan through all 255 possible first characters.
	 * NOTE(review): no word-read initialisation call is visible before
	 * this loop although DB_EndReadWords() is called after it -- confirm
	 * whether the DB layer requires one.
	 */
	for (j = 1; j < 256; j++)
	{
		word[0] = (unsigned char) j;
		word[1] = '\0';
		DB_ReadFirstWordInvertedIndex (sw,
					       word, &resultword, &wordID,
					       indexf->DB);

		/* for each word starting with this character create a node */
		while (wordID
		       && (((int) ((unsigned char) resultword[0])) == j))
		{
			/* Add word tag with its name; the text is escaped */
			word_node = xmlNewChild (swishe_node, NULL,
						 BAD_CAST "word", NULL);
			xmlNewTextChild (word_node, NULL, BAD_CAST "name",
					 BAD_CAST resultword);

			/* Read Word's data */
			DB_ReadWordData (sw, wordID, &worddata, &sz_worddata,
					 &saved_bytes, indexf->DB);
			uncompress_worddata (&worddata, &sz_worddata,
					     saved_bytes);

			/*
			 * Parse the word's data.  The first two values are
			 * not used here but must still be consumed to
			 * advance the read cursor.
			 */
			s = worddata;

			(void) uncompress2 (&s);	/* tfrequency */
			(void) uncompress2 (&s);	/* metaID */
			metadata_length = uncompress2 (&s);

			filenum = 0;
			start = s;
			while (1)
			{	/* Read on all items */
				uncompress_location_values (&s, &flag,
							    &tmpval,
							    &frequency);
				/* file numbers are accumulated from increments */
				filenum += tmpval;

				/*
				 * Positions are not written out, but they
				 * are decoded so the cursor moves past them.
				 */
				posdata = (unsigned int *)
					emalloc (frequency *
						 sizeof (unsigned int));
				uncompress_location_positions (&s, flag,
							       frequency,
							       posdata);

				/* without a DOCPATH property nothing is emitted */
				if (docpath_meta != NULL)
					xml_add_path_node (indexf,
							   docpath_meta,
							   word_node, filenum,
							   frequency);

				efree (posdata);

				/* Check for end of worddata */
				if ((s - worddata) == sz_worddata)
					break;	/* End of worddata */

				/* Check for end of current metaID data */
				if (metadata_length == (s - start))
				{
					filenum = 0;
					(void) uncompress2 (&s);	/* metaID */
					metadata_length = uncompress2 (&s);
					start = s;
				}
			}

			/* end of <word> tag */
			efree (worddata);
			efree (resultword);

			/* get next word in list */
			DB_ReadNextWordInvertedIndex (sw, word, &resultword,
						      &wordID, indexf->DB);
		}
	}
	DB_EndReadWords (sw, indexf->DB);

	/* Dump XML to standard output */
	if (xmlSaveFormatFileEnc ("-", doc, "UTF-8", 1) < 0)
		fprintf (stderr, "swish2xml: failed to write XML output\n");

	/* Free the document */
	xmlFreeDoc (doc);

	/*
	 * Free global variables.
	 * NOTE(review): libxml2 documents xmlCleanupParser() as a
	 * once-per-process call; this function runs once per index file.
	 */
	xmlCleanupParser ();

	/* Dump memory if in debug mode */
	xmlMemoryDump ();

#else
	printf("Tree support disabled in libxml2.\n");
#endif
}
