• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

NepomukDaemons

sopranoindexreader.cpp

Go to the documentation of this file.
00001 /*
00002    Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
00003 
00004    This library is free software; you can redistribute it and/or
00005    modify it under the terms of the GNU General Public License as
00006    published by the Free Software Foundation; either version 2 of
00007    the License, or (at your option) any later version.
00008 
00009    This library is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012    Library General Public License for more details.
00013 
00014    You should have received a copy of the GNU General Public License
00015    along with this library; see the file COPYING.  If not, write to
00016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017    Boston, MA 02110-1301, USA.
00018 */
00019 
00020 #include "sopranoindexreader.h"
00021 #include "tstring.h"
00022 #include <strigi/query.h>
00023 #include <strigi/queryparser.h>
00024 #include <strigi/fieldtypes.h>
00025 #include "util.h"
00026 
00027 #include <Soprano/Soprano>
00028 #include <Soprano/Index/IndexFilterModel>
00029 #include <Soprano/Index/CLuceneIndex>
00030 #include <Soprano/Vocabulary/XMLSchema>
00031 
00032 #include <map>
00033 #include <utility>
00034 #include <sstream>
00035 
00036 #include <CLucene.h>
00037 
00038 #include <QtCore/QThread>
00039 #include <QtCore/QDateTime>
00040 #include <QtCore/QDebug>
00041 #include <QtCore/QString>
00042 #include <QtCore/QLatin1String>
00043 #include <QtCore/QFile>
00044 
00045 
00046 using namespace Soprano;
00047 
00048 
00049 static lucene::index::Term* createWildCardTerm( const TString& name,
00050                                                 const string& value );
00051 static lucene::index::Term* createTerm( const TString& name,
00052                                         const string& value );
00053 static lucene::index::Term* createKeywordTerm( const TString& name,
00054                                                const string& value );
00055 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query );
00056 static lucene::search::Query* createQuery( const Strigi::Query& query );
00057 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query );
00058 static lucene::search::Query* createSingleFieldQuery( const string& field,
00059                                                       const Strigi::Query& query );
00060 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query );
00061 
#if 0
// Disabled helper (compiled out): returns a copy of s in which every match
// of the character class below is replaced by "\\\\1", i.e. a backslash
// followed by the matched character.
static QString luceneQueryEscape( const QString& s )
{
    // Characters matched: + - & | ! ( ) { } [ ] ^ " ~ : and the backslash.
    static QRegExp rx( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );

    QString escaped( s );
    escaped.replace( rx, "\\\\1" );
    return escaped;
}
#endif
00073 
00074 static lucene::index::Term* createWildCardTerm( const TString& name,
00075                                                 const string& value )
00076 {
00077     TString v = TString::fromUtf8( value.c_str() );
00078     return _CLNEW lucene::index::Term( name.data(), v.data() );
00079 }
00080 
00081 static lucene::index::Term* createTerm( const TString& name,
00082                                         const string& value )
00083 {
00084     qDebug() << "createTerm" << name << value.c_str();
00085 
00086     TString v = TString::fromUtf8( value.c_str() );
00087 
00088     lucene::util::StringReader sr( v.data() );
00089     lucene::analysis::standard::StandardAnalyzer a;
00090     lucene::analysis::TokenStream* ts = a.tokenStream(name.data(), &sr);
00091     lucene::analysis::Token* to = ts->next();
00092     const wchar_t *tv;
00093     if (to) {
00094         tv = to->termText();
00095     } else {
00096         tv = v.data();
00097     }
00098     lucene::index::Term* t = _CLNEW lucene::index::Term(name.data(), tv);
00099     if (to) {
00100         _CLDELETE(to);
00101     }
00102     _CLDELETE(ts);
00103     return t;
00104 }
00105 
00106 static lucene::index::Term* createKeywordTerm( const TString& name,
00107                                                const string& value )
00108 {
00109     TString v = TString::fromUtf8( value.c_str() );
00110     lucene::index::Term* t = _CLNEW lucene::index::Term( name.data(), v.data() );
00111     return t;
00112 }
00113 
00114 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query )
00115 {
00116     lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00117     bool isAnd = query.type() == Strigi::Query::And;
00118     const vector<Strigi::Query>& sub = query.subQueries();
00119     for (vector<Strigi::Query>::const_iterator i = sub.begin(); i != sub.end(); ++i) {
00120         lucene::search::Query* q = createQuery(*i);
00121         bq->add(q, true, isAnd, i->negate());
00122     }
00123     return bq;
00124 }
00125 
00126 static lucene::search::Query* createQuery( const Strigi::Query& query )
00127 {
00128     return query.subQueries().size()
00129         ? createBooleanQuery(query)
00130         : createSimpleQuery(query);
00131 }
00132 
00133 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query )
00134 {
00135     switch (query.fields().size()) {
00136     case 0:  return createSingleFieldQuery("text", query);
00137     case 1:  return createSingleFieldQuery(query.fields()[0], query);
00138     default: return createMultiFieldQuery(query);
00139     }
00140 }
00141 
00142 static lucene::search::Query* createSingleFieldQuery( const string& field,
00143                                                       const Strigi::Query& query ) {
00144     qDebug() << "Creating single field query: " << field.c_str();
00145     TString fieldname = Strigi::Soprano::Util::convertSearchField( field );
00146     lucene::search::Query* q;
00147     lucene::index::Term* t;
00148     const string& val = query.term().string();
00149     switch (query.type()) {
00150     case Strigi::Query::LessThan:
00151           t = createTerm(fieldname, val.c_str());
00152           q = _CLNEW lucene::search::RangeQuery(0, t, false);
00153           break;
00154     case Strigi::Query::LessThanEquals:
00155           t = createTerm(fieldname, query.term().string());
00156           q = _CLNEW lucene::search::RangeQuery(0, t, true);
00157           break;
00158     case Strigi::Query::GreaterThan:
00159           t = createTerm(fieldname, query.term().string());
00160           q = _CLNEW lucene::search::RangeQuery(t, 0, false);
00161           break;
00162     case Strigi::Query::GreaterThanEquals:
00163           t = createTerm(fieldname, query.term().string());
00164           q = _CLNEW lucene::search::RangeQuery(t, 0, true);
00165           break;
00166     case Strigi::Query::Keyword:
00167           t = createKeywordTerm(fieldname, query.term().string());
00168           q = _CLNEW lucene::search::TermQuery(t);
00169           break;
00170     default:
00171           if (strpbrk(val.c_str(), "*?")) {
00172                t = createWildCardTerm(fieldname, val);
00173                q = _CLNEW lucene::search::WildcardQuery(t);
00174           } else {
00175                t = createTerm(fieldname, val);
00176                q = _CLNEW lucene::search::TermQuery(t);
00177           }
00178     }
00179     _CLDECDELETE(t);
00180     return q;
00181 }
00182 
00183 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query )
00184 {
00185     lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00186     for (vector<string>::const_iterator i = query.fields().begin();
00187             i != query.fields().end(); ++i) {
00188         lucene::search::Query* q = createSingleFieldQuery(*i, query);
00189         bq->add(q, true, false, false);
00190     }
00191     return bq;
00192 }
00193 
00194 
00195 static QString escapeLiteralForSparqlQuery( const QString& s )
00196 {
00197     return QString( s ).replace( '\\', "\\\\" ).replace( '\"', "\\\"" );
00198 }
00199 
00200 
00201 class Strigi::Soprano::IndexReader::Private
00202 {
00203 public:
00204     bool createDocument( const Node& res, IndexedDocument& doc ) {
00205         StatementIterator it = repository->listStatements( Statement( res, Node(), Node() ) );
00206         if ( it.lastError() ) {
00207             return false;
00208         }
00209 
00210         // use the resource URI as fallback file URI
00211         doc.uri = res.uri().toLocalFile().toUtf8().data();
00212 
00213         while ( it.next() ) {
00214             Statement s = *it;
00215             if ( s.object().isLiteral() ) {
00216                 std::string fieldName = Util::fieldName( s.predicate().uri() );
00217                 std::string value = s.object().toString().toUtf8().data();
00218 
00219                 if (fieldName == "text") {
00220                     doc.fragment = value;
00221                 }
00222                 else if (fieldName == FieldRegister::pathFieldName) {
00223                     qDebug() << "Setting IndexedDocument uri=" << value.c_str();
00224                     doc.uri = value;
00225                 }
00226                 else if (fieldName == FieldRegister::mimetypeFieldName) {
00227                     doc.mimetype = value;
00228                 }
00229                 else if (fieldName == FieldRegister::mtimeFieldName) {
00230                     // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
00231                     if ( s.object().literal().isDateTime() ) {
00232                         doc.mtime = s.object().literal().toDateTime().toTime_t();
00233                     }
00234                     else {
00235                         doc.mtime = s.object().literal().toUnsignedInt();
00236                     }
00237                 }
00238                 else if (fieldName == FieldRegister::sizeFieldName) {
00239                     doc.size = s.object().literal().toInt64();
00240                 }
00241                 else {
00242                     doc.properties.insert( make_pair<const string, string>( fieldName, value ) );
00243                 }
00244             }
00245             else {
00246                 // FIXME: For "Strigi++" we should at least go one level deeper, i.e. make an RDF query on those results that are
00247                 // not literal statements
00248             }
00249         }
00250 
00251         return true;
00252     }
00253 
00254 //    ::Soprano::Index::IndexFilterModel* repository;
00255     ::Soprano::Model* repository;
00256 };
00257 
00258 
00259 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model* model )
00260     : Strigi::IndexReader()
00261 {
00262     qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
00263     d = new Private;
00264     d->repository = model;
00265 }
00266 
00267 
00268 Strigi::Soprano::IndexReader::~IndexReader()
00269 {
00270     qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
00271     delete d;
00272 }
00273 
00274 
00275 int32_t Strigi::Soprano::IndexReader::countHits( const Query& query )
00276 {
00277     qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
00278 
00279     lucene::search::Query* q = createQuery( query );
00280     ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( q->toString(), true ),
00281                                                                        ::Soprano::Query::QueryLanguageUser,
00282                                                                        QLatin1String( "lucene" ) );
00283 //    Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( q );
00284     int s = 0;
00285     while ( hits.next() ) {
00286         qDebug() << "Query hit:" << hits.binding( 0 );
00287         ++s;
00288     }
00289     _CLDELETE(q);
00290     return s;
00291 }
00292 
00293 
00294 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query& query,
00295                                             const std::vector<std::string>& fields,
00296                                             const std::vector<Strigi::Variant::Type>& types,
00297                                             std::vector<std::vector<Strigi::Variant> >& result,
00298                                             int off, int max )
00299 {
00300     qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
00301     lucene::search::Query* bq = createQuery( query );
00302     ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00303                                                                        ::Soprano::Query::QueryLanguageUser,
00304                                                                        QLatin1String( "lucene" ) );
00305 //    Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
00306 
00307     int i = -1;
00308     while ( hits.next() ) {
00309         ++i;
00310         if ( i < off ) {
00311             continue;
00312         }
00313         if ( i > max ) {
00314             break;
00315         }
00316 
00317 //        ::Soprano::Index::QueryHit hit = *hits;
00318         std::vector<Strigi::Variant> resultRow;
00319         std::vector<std::string>::const_iterator fieldIt = fields.begin();
00320         std::vector<Strigi::Variant::Type>::const_iterator typesIt = types.begin();
00321         while ( fieldIt != fields.end() ) {
00322             if ( typesIt == types.end() ) {
00323                 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
00324                 return;
00325             }
00326 
00327             StatementIterator it = d->repository->listStatements( Statement( hits.binding( "resource" ),
00328                                                                              Util::fieldUri( *fieldIt ),
00329                                                                              Node() ) );
00330             // FIXME: what if we have a field with a cardinality > 1?
00331             if ( it.next() ) {
00332                 resultRow.push_back( Util::nodeToVariant( it.current().object() ) );
00333             }
00334             else {
00335                 resultRow.push_back( Strigi::Variant() );
00336             }
00337 
00338             ++fieldIt;
00339             ++typesIt;
00340         }
00341 
00342         result.push_back( resultRow );
00343     }
00344     _CLDELETE(bq);
00345 }
00346 
00347 
00348 std::vector<Strigi::IndexedDocument> Strigi::Soprano::IndexReader::query( const Query& query, int off, int max )
00349 {
00350     qDebug() << "IndexReader::query in thread" << QThread::currentThread();
00351     vector<IndexedDocument> results;
00352     lucene::search::Query* bq = createQuery( query );
00353     ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00354                                                                        ::Soprano::Query::QueryLanguageUser,
00355                                                                        QLatin1String( "lucene" ) );
00356 //    Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
00357 
00358     int i = -1;
00359     while ( hits.next() ) {
00360         ++i;
00361         if ( i < off ) {
00362             continue;
00363         }
00364         if ( i > max ) {
00365             break;
00366         }
00367 
00368         IndexedDocument result;
00369 //        ::Soprano::Index::QueryHit hit = *hits;
00370         result.score = hits.binding( 1 ).literal().toDouble();
00371         if ( d->createDocument( hits.binding( 0 ), result ) ) {
00372             results.push_back( result );
00373         }
00374         else {
00375             qDebug() << "Failed to create indexed document for resource " << hits.binding( 0 ) << ": " << d->repository->lastError();
00376         }
00377     }
00378     _CLDELETE(bq);
00379     return results;
00380 }
00381 
00382 
00383 // an empty parent url is perfectly valid as strigi stores a parent url for everything
00384 void Strigi::Soprano::IndexReader::getChildren( const std::string& parent,
00385                                                 std::map<std::string, time_t>& children )
00386 {
00387 //    qDebug() << "IndexReader::getChildren in thread" << QThread::currentThread();
00388     QString query = QString( "select distinct ?path ?mtime where { "
00389                              "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
00390                              "?r <%4> ?mtime . "
00391                              "?r <%5> ?path . "
00392                              "}")
00393                     .arg( Util::fieldUri( FieldRegister::parentLocationFieldName ).toString(),
00394                           escapeLiteralForSparqlQuery( QString::fromUtf8( parent.c_str() ) ),
00395                           Vocabulary::XMLSchema::string().toString(),
00396                           Util::fieldUri( FieldRegister::mtimeFieldName ).toString(),
00397                           Util::fieldUri( FieldRegister::pathFieldName ).toString(),
00398                           Node( QUrl::fromLocalFile( QFile::decodeName( parent.c_str() ) ) ).toN3() );
00399 
00400 //    qDebug() << "running getChildren query:" << query;
00401 
00402     QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
00403 
00404     while ( result.next() ) {
00405         Node pathNode = result.binding( "path" );
00406         Node mTimeNode = result.binding( "mtime" );
00407 //        qDebug() << "file in index: " << pathNode.toString() << "mtime:" << mTimeNode.literal().toDateTime() << "(" << mTimeNode.literal().toDateTime().toTime_t() << ")";
00408 
00409         // be backwards compatible in case there are paths left encoded as literals
00410         std::string path;
00411         if ( pathNode.isLiteral() )
00412             path = pathNode.toString().toUtf8().data();
00413         else
00414             path = QFile::encodeName( pathNode.uri().toLocalFile() ).data();
00415 
00416         // Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
00417         if ( mTimeNode.literal().isDateTime() ) {
00418             children[path] = mTimeNode.literal().toDateTime().toTime_t();
00419         }
00420         else {
00421             children[path] = mTimeNode.literal().toUnsignedInt();
00422         }
00423     }
00424 }
00425 
00426 
00427 int32_t Strigi::Soprano::IndexReader::countDocuments()
00428 {
00429     qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
00430     // FIXME: the only solution I see ATM is: select distinct ?r where { ?r ?p ?o }
00431     return 0;
00432 }
00433 
00434 
00435 int32_t Strigi::Soprano::IndexReader::countWords()
00436 {
00437     qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
00438     // FIXME: what to do here? use the index? Count the predicates?
00439     return -1;
00440 }
00441 
00442 
00443 int64_t Strigi::Soprano::IndexReader::indexSize()
00444 {
00445     qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
00446     return d->repository->statementCount();
00447 }
00448 
00449 
00450 time_t Strigi::Soprano::IndexReader::mTime( const std::string& uri )
00451 {
00452 //    qDebug() << "IndexReader::mTime in thread" << QThread::currentThread();
00453     QString query = QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
00454                     .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString(),
00455                           Util::fieldUri( FieldRegister::pathFieldName ).toString(),
00456                           escapeLiteralForSparqlQuery( QString::fromUtf8( uri.c_str() ) ),
00457                           Vocabulary::XMLSchema::string().toString() );
00458 
00459     qDebug() << "mTime( " << uri.c_str() << ") query:" << query;
00460 
00461     QueryResultIterator it = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
00462 
00463     time_t mtime = 0;
00464     if ( it.next() ) {
00465         ::Soprano::LiteralValue val = it.binding( "mtime" ).literal();
00466 
00467         // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
00468         if ( val.isDateTime() ) {
00469             mtime = val.toDateTime().toTime_t();
00470         }
00471         else {
00472             mtime = val.toUnsignedInt();
00473         }
00474     }
00475     return mtime;
00476 }
00477 
00478 
00479 std::vector<std::string> Strigi::Soprano::IndexReader::fieldNames()
00480 {
00481     qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
00482     // This is a weird method
00483     // Our list of field names (the predicates) is probably awefully long.
00484 
00485     std::vector<std::string> fields;
00486     QueryResultIterator it = d->repository->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QueryLanguageSparql );
00487     while ( it.next() ) {
00488         fields.push_back( Util::fieldName( it.binding("p").uri() ) );
00489     }
00490     return fields;
00491 }
00492 
00493 
00494 std::vector<std::pair<std::string,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string& query,
00495                                                                                        const std::string& fieldname,
00496                                                                                        const std::string& labeltype )
00497 {
00498     Q_UNUSED(query);
00499     Q_UNUSED(fieldname);
00500     Q_UNUSED(labeltype);
00501 
00502     // FIXME: what is meant by fieldname and labeltype?
00503     qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
00504     // IMPLEMENTME? Seems not like a very important method though.
00505     return std::vector<std::pair<std::string,uint32_t> >();
00506 }
00507 
00508 
00509 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string& keywordprefix,
00510                                                      const std::vector<std::string>& fieldnames)
00511 {
00512     Q_UNUSED(keywordprefix);
00513     Q_UNUSED(fieldnames);
00514 
00515     qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
00516     // the clucene indexer also returns 2. I suspect this means: "not implemented" ;)
00517     return 2;
00518 }
00519 
00520 
00521 std::vector<std::string> Strigi::Soprano::IndexReader::keywords( const std::string& keywordmatch,
00522                                                                  const std::vector<std::string>& fieldnames,
00523                                                                  uint32_t max, uint32_t offset )
00524 {
00525     Q_UNUSED(keywordmatch);
00526     Q_UNUSED(fieldnames);
00527     Q_UNUSED(max);
00528     Q_UNUSED(offset);
00529 
00530     qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
00531     // IMPLEMENTME? Seems like a rarely used method...
00532     return std::vector<std::string>();
00533 }

NepomukDaemons

Skip menu "NepomukDaemons"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

API Reference

Skip menu "API Reference"
  • KCMShell
  • KNotify
  • KStyles
  • Nepomuk Daemons
Generated for API Reference by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal