/* $Id: $ ** ** swish2xml.c - dump a binary swish-e database to XML ** mostly compatible with jsFind ** ** build from within swish-e/src dir ** gcc -o swish2xml -I. -Ireplace -I /usr/include/libxml2/ -L.libs -lswish-e -lxml2 swish2xml.c ** ./swish2xml > swish.xml ** ** This program and library is free software; you can redistribute it and/or ** modify it under the terms of the GNU (Library) General Public License ** as published by the Free Software Foundation; either version 2 ** of the License, or any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU (Library) General Public License for more details. ** ** You should have received a copy of the GNU (Library) General Public License ** along with this program; if not, write to the Free Software ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. **--------------------------------------------------------- */ #include #include #include #include "swish.h" #include "mem.h" #include "search.h" #include "merge.h" #include "docprop.h" #include "metanames.h" #include "db.h" #include #include void DB_xml_dump (SWISH * sw, IndexFILE * indexf); int main (int argc, char **argv) { SWISH *sw; setlocale (LC_CTYPE, ""); SwishErrorsToStderr (); /* new SWISH handle */ sw = SwishNew (); /* TODO: properly parse command line arguments */ addindexfile (sw, argv[1] && *(argv[1]) ? argv[1] : INDEXFILE); /* Set the default index file */ if (sw->indexlist == NULL) addindexfile (sw, INDEXFILE); /* process each index file */ while (sw->indexlist != NULL) { DB_xml_dump (sw, sw->indexlist); putchar ('\n'); sw->indexlist = sw->indexlist->next; } /* close SWISH handle */ SwishClose (sw); exit (0); } /* Print out the data in an index DB */ void DB_xml_dump (SWISH * sw, IndexFILE * indexf) { #ifdef LIBXML_TREE_ENABLED int i, j, c, fieldnum, frequency, metaID, tmpval, printedword, filenum; unsigned int *posdata; int metadata_length; char word[2]; char *resultword; unsigned char *worddata, *s, *start, flag; int sz_worddata, saved_bytes; sw_off_t wordID; int *meta_used; int end_meta = 0; xmlDocPtr doc = NULL; /* document pointer */ xmlNodePtr swishe_node = NULL, node = NULL, node1 = NULL; /* node pointers */ xmlDtdPtr dtd = NULL; /* DTD pointer */ char xmlbuff[MAXSTRLEN + 1]; LIBXML_TEST_VERSION; metaID = 0; metadata_length = 0; c = 0; frequency = 0; /* Open Database */ indexf->DB = DB_Open (sw, indexf->line, DB_READ); if (sw->lasterror) SwishAbortLastError (sw); /* Read header */ read_header (sw, &indexf->header, indexf->DB); /* Create a new XML document and root node */ doc = xmlNewDoc(BAD_CAST "1.0"); swishe_node = xmlNewNode(NULL, BAD_CAST "swish-e"); xmlDocSetRootElement(doc, swishe_node); /* Declare DTD */ dtd = xmlCreateIntSubset(doc, BAD_CAST "swish-e", NULL, BAD_CAST "swish-e.dtd"); for (i = 0; i < indexf->header.metaCounter; i++) if (indexf->header.metaEntryArray[i]->metaID > end_meta) end_meta = indexf->header.metaEntryArray[i]->metaID; meta_used = emalloc (sizeof (int) * (end_meta + 1)); /* _META only reports which tags the words are found in */ for (i = 0; i <= end_meta; i++) meta_used[i] = 0; /* scan through all 255 characters */ for (j = 1; j < 256; j++) { word[0] = (unsigned char) j; word[1] = '\0'; DB_ReadFirstWordInvertedIndex (sw, word, &resultword, &wordID, indexf->DB); /* for each word create a new XML node */ while (wordID && (((int) ((unsigned char) resultword[0])) == j)) { /* Add word tag */ node = xmlNewChild(swishe_node, NULL, BAD_CAST "word", NULL); /* Add name tag to word */ sprintf(xmlbuff, "%s",resultword); node1 = xmlNewChild(node, NULL, BAD_CAST "name", BAD_CAST xmlbuff); /* Read Word's data */ DB_ReadWordData (sw, wordID, &worddata, &sz_worddata, &saved_bytes, indexf->DB); uncompress_worddata (&worddata, &sz_worddata, saved_bytes); /* parse and print word's data */ s = worddata; tmpval = uncompress2 (&s); /* tfrequency */ metaID = uncompress2 (&s); /* metaID */ metadata_length = uncompress2 (&s); filenum = 0; start = s; while (1) { /* Read on all items */ struct metaEntry *m; uncompress_location_values (&s, &flag, &tmpval, &frequency); filenum += tmpval; posdata = (unsigned int *) emalloc (frequency * sizeof (unsigned int)); uncompress_location_positions (&s, flag, frequency, posdata); /* Get DOCPATH from property list */ if ((m = getPropNameByName (&indexf->header, AUTOPROPERTY_DOCPATH))) { RESULT r; DB_RESULTS db_results; char *s; PropValue *p; memset (&r, 0, sizeof (RESULT)); memset (&db_results, 0, sizeof (DB_RESULTS)); db_results.indexf = indexf; r.db_results = &db_results; r.filenum = filenum; r.fi.filenum = filenum; s = getResultPropAsString (&r, m->metaID); p = getResultPropValue (&r, AUTOPROPERTY_TITLE, 0); /* Add path tag to word */ node1 = xmlNewChild(node, NULL, BAD_CAST "path", BAD_CAST s); /* Add frequency property to word */ sprintf(xmlbuff,"%d",frequency); xmlNewProp(node1, BAD_CAST "freq", BAD_CAST xmlbuff); /* Add title property to path if title is a string */ if (PROP_STRING == p->datatype) xmlNewProp(node1, BAD_CAST "title", BAD_CAST p->value.v_str); /* free results */ freeResultPropValue (p); efree (s); } /* else there was an error looking up metadata */ efree (posdata); /* Check for end of worddata */ if ((s - worddata) == sz_worddata) break; /* End of worddata */ /* Check for end of current metaID data */ if (metadata_length == (s - start)) { filenum = 0; metaID = uncompress2 (&s); metadata_length = uncompress2 (&s); start = s; } } /* end of tag */ efree (worddata); efree (resultword); /* get next word in list */ DB_ReadNextWordInvertedIndex (sw, word, &resultword, &wordID, indexf->DB); } } DB_EndReadWords (sw, indexf->DB); efree (meta_used); /* Dump XML to standard output */ xmlSaveFormatFileEnc("-", doc, "UTF-8", 1); /* Free the document */ xmlFreeDoc(doc); /* Free global variables */ xmlCleanupParser(); /* Dump memory if in debug mode */ xmlMemoryDump(); #else printf("Tree support disabled in libxml2.\n"); #endif }