• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

NepomukDaemons

searchthread.cpp

Go to the documentation of this file.
00001 /*
00002   This file is part of the Nepomuk KDE project.
00003   Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
00004 
00005   This library is free software; you can redistribute it and/or
00006   modify it under the terms of the GNU Library General Public
00007   License version 2 as published by the Free Software Foundation.
00008 
00009   This library is distributed in the hope that it will be useful,
00010   but WITHOUT ANY WARRANTY; without even the implied warranty of
00011   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012   Library General Public License for more details.
00013 
00014   You should have received a copy of the GNU Library General Public License
00015   along with this library; see the file COPYING.LIB.  If not, write to
00016   the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017   Boston, MA 02110-1301, USA.
00018 */
00019 
00020 #include "searchthread.h"
00021 #include "term.h"
00022 #include "qurlhash.h"
00023 
00024 #include <Nepomuk/ResourceManager>
00025 #include <Nepomuk/Resource>
00026 #include <Nepomuk/Types/Property>
00027 #include <Nepomuk/Types/Class>
00028 #include <Nepomuk/Types/Literal>
00029 
00030 #include <Soprano/Version>
00031 #include <Soprano/Model>
00032 #include <Soprano/QueryResultIterator>
00033 #include <Soprano/Node>
00034 #include <Soprano/Statement>
00035 #include <Soprano/LiteralValue>
00036 #include <Soprano/StatementIterator>
00037 #include <Soprano/Vocabulary/RDF>
00038 #include <Soprano/Vocabulary/RDFS>
00039 #include <Soprano/Vocabulary/NRL>
00040 #include <Soprano/Vocabulary/NAO>
00041 #include <Soprano/Vocabulary/XMLSchema>
00042 #include <Soprano/Vocabulary/OWL>
00043 
00044 #include <KDebug>
00045 
00046 #include <QtCore/QTime>
00047 
00048 
00049 
00050 // FIXME: With our cutoff score we might miss results that are hit multiple times and thus, would get their
00051 //        score increased
00052 
00053 #warning Make query optimization methods return an invalid term if the query cannot be resolved and handle this as no results
00054 
00055 using namespace Soprano;
00056 
00057 namespace {
00062     const int MAX_RESOURCES = 4;
00063 
00064 
00065     void mergeInResult( QHash<QUrl, Nepomuk::Search::Result>& results, const Nepomuk::Search::Result& resource ) {
00066         QHash<QUrl, Nepomuk::Search::Result>::iterator old = results.find( resource.resourceUri() );
00067         if ( old == results.end() ) {
00068             results.insert( resource.resourceUri(), resource );
00069         }
00070         else {
00071             // FIXME: how do we join the scores properly? Is adding a good idea? It can certainly not be multiplication!
00072             Nepomuk::Search::Result& result = *old;
00073             result.setScore( result.score() + resource.score() );
00074         }
00075     }
00076 
00077     void mergeInResults( QHash<QUrl, Nepomuk::Search::Result>& results, const QHash<QUrl, Nepomuk::Search::Result>& otherResults ) {
00078         for ( QHash<QUrl, Nepomuk::Search::Result>::const_iterator it = otherResults.constBegin();
00079               it != otherResults.constEnd(); ++it ) {
00080             mergeInResult( results, it.value() );
00081         }
00082     }
00083 
00084     // This is a copy of Soprano::Index::IndexFilterModel::encodeStringForLuceneQuery
00085     // which we do not use to prevent linking to sopranoindex
00086     QString luceneQueryEscape( const QString& s ) {
00087         /* Chars to escape: + - && || ! ( ) { } [ ] ^ " ~  : \ */
00088 
00089         static QRegExp rx( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );
00090         QString es( s );
00091         es.replace( rx, "\\\\1" );
00092         return es;
00093     }
00094 
00095     QString luceneQueryEscape( const QUrl& s ) {
00096         return luceneQueryEscape( QString::fromAscii( s.toEncoded() ) );
00097     }
00098 
00099     QString createLuceneLiteralQuery( const QString& escaped ) {
00100         if ( escaped.contains( QRegExp( "\\s" ) ) ) {
00101             return "\"" + escaped + "\"";
00102         }
00103         else {
00104             return escaped;
00105         }
00106     }
00107 
00108     QString createLuceneQuery( const Nepomuk::Search::SearchNode& node ) {
00109         if ( node.term.type() == Nepomuk::Search::Term::LiteralTerm ) {
00110             return createLuceneLiteralQuery( luceneQueryEscape( node.term.value().toString() ) );
00111         }
00112         else if ( node.term.type() == Nepomuk::Search::Term::ComparisonTerm ) {
00113             return luceneQueryEscape( node.term.property() ) + ':' + createLuceneLiteralQuery( luceneQueryEscape( node.term.subTerms().first().value().toString() ) );
00114         }
00115         else {
00116             Q_ASSERT( node.term.type() == Nepomuk::Search::Term::AndTerm ||
00117                       node.term.type() == Nepomuk::Search::Term::OrTerm );
00118 
00119             QStringList sq;
00120             foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
00121                 sq += createLuceneQuery( n );
00122             }
00123             if ( node.term.type() == Nepomuk::Search::Term::AndTerm ) {
00124                 return " ( " + sq.join( " AND " ) + " ) ";
00125             }
00126             else {
00127                 return " ( " + sq.join( " OR " ) + " ) ";
00128             }
00129         }
00130     }
00131 
00132     QString comparatorString( Nepomuk::Search::Term::Comparator c ) {
00133         switch( c ) {
00134         case Nepomuk::Search::Term::Contains:
00135             return ":";
00136         case Nepomuk::Search::Term::Equal:
00137             return "=";
00138         case Nepomuk::Search::Term::Greater:
00139             return ">";
00140         case Nepomuk::Search::Term::Smaller:
00141             return "<";
00142         case Nepomuk::Search::Term::GreaterOrEqual:
00143             return ">=";
00144         case Nepomuk::Search::Term::SmallerOrEqual:
00145             return "<=";
00146         }
00147         // make gcc happy
00148         return QString();
00149     }
00150 
00151 
00152     bool isNumberLiteralValue( const Soprano::LiteralValue& value ) {
00153         return value.isInt() || value.isInt64() || value.isUnsignedInt() || value.isUnsignedInt64() || value.isDouble();
00154     }
00155 
00156 
00157     QString createGraphPattern( const Nepomuk::Search::SearchNode& node, int& varCnt, const QString& varName = QString( "?r" ) )
00158     {
00159         switch( node.term.type() ) {
00160         case Nepomuk::Search::Term::ComparisonTerm: {
00161 
00162             Nepomuk::Search::Term subTerm( node.term.subTerms().first() );
00163 
00164             //
00165             // is the subterm (we only support one ATM) a final term (no further subterms)
00166             // -> actually match the literal or resource
00167             //
00168             if ( subTerm.type() == Nepomuk::Search::Term::ResourceTerm ||
00169                  subTerm.type() == Nepomuk::Search::Term::LiteralTerm ) {
00170                 if( node.term.comparator() != Nepomuk::Search::Term::Equal ) {
00171                     // For numbers there is no need for quotes + this way we can handle all the xsd decimal types
00172                     // FIXME: it may be necessary to escape stuff
00173                     QString filter = QString( "?var%1 %2 " )
00174                                      .arg( ++varCnt )
00175                                      .arg( comparatorString( node.term.comparator() ) );
00176                     if ( isNumberLiteralValue( subTerm.value() ) ) {
00177                         filter += subTerm.value().toString();
00178                     }
00179                     else {
00180                         Nepomuk::Types::Property prop( node.term.property() );
00181                         filter += QString( "\"%1\"" ).arg( subTerm.value().toString() );
00182                         if ( prop.literalRangeType().dataTypeUri().isValid() )
00183                             filter += QString( "^^<%1>" ).arg( prop.literalRangeType().dataTypeUri().toString() );
00184                     }
00185 
00186                     return QString( "%1 <%2> ?var%3 . FILTER(%4) . " )
00187                         .arg( varName )
00188                         .arg( QString::fromAscii( node.term.property().toEncoded() ) )
00189                         .arg( varCnt )
00190                         .arg( filter );
00191                 }
00192                 else {
00193                     if ( subTerm.type() == Nepomuk::Search::Term::ResourceTerm ) {
00194                         return QString( "%1 <%2> <%3> . " )
00195                             .arg( varName )
00196                             .arg( QString::fromAscii( node.term.property().toEncoded() ) )
00197                             .arg( QString::fromAscii( subTerm.resource().toEncoded() ) );
00198                     }
00199                     else if ( Nepomuk::Types::Property( node.term.property() ).range().isValid() ) {
00200                         return QString( "%7 <%1> ?x . { ?x <%2> \"%3\"^^<%4> . } UNION { ?x <%5> \"%3\"^^<%4>.  } UNION { ?x <%6> \"%3\"^^<%4> . }" )
00201                             .arg( QString::fromAscii( node.term.property().toEncoded() ) )
00202                             .arg( Soprano::Vocabulary::RDFS::label().toString() )
00203                             .arg( subTerm.value().toString() )
00204                             .arg( Soprano::Vocabulary::XMLSchema::string().toString() )
00205                             .arg( Soprano::Vocabulary::NAO::prefLabel().toString() )
00206                             .arg( Soprano::Vocabulary::NAO::identifier().toString() )
00207                             .arg( varName );
00208                     }
00209                     else {
00210                         return QString( "%1 <%2> \"%3\"^^<%4> . " )
00211                             .arg( varName )
00212                             .arg( QString::fromAscii( node.term.property().toEncoded() ) )
00213                             .arg( subTerm.value().toString() )
00214                             .arg( Nepomuk::Types::Property( node.term.property() ).literalRangeType().dataTypeUri().toString() );
00215                     }
00216                 }
00217             }
00218 
00219             //
00220             // Is the subterm not final, i.e. has further subterms
00221             // -> combine graph pattern with subterm graph pattern
00222             //
00223             else {
00224                 QString bridgeVarName = QString( "?var%1" ).arg( ++varCnt );
00225                 return QString( "%1 <%2> %3 . " )
00226                     .arg( varName )
00227                     .arg( QString::fromAscii( node.term.property().toEncoded() ) )
00228                     .arg( bridgeVarName )
00229                     + createGraphPattern( node.subNodes.first(), varCnt, bridgeVarName );
00230             }
00231         }
00232 
00233         case Nepomuk::Search::Term::AndTerm: {
00234             QString s( "{ " );
00235             foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
00236                 s += createGraphPattern( n, varCnt );
00237             }
00238             s += "} ";
00239             return s;
00240         }
00241 
00242         case Nepomuk::Search::Term::OrTerm: {
00243             QStringList s;
00244             foreach( const Nepomuk::Search::SearchNode& n, node.subNodes ) {
00245                 s += createGraphPattern( n, varCnt );
00246             }
00247             Q_ASSERT( !s.isEmpty() );
00248             return "{ " + s.join( " } UNION { " ) + " } ";
00249         }
00250 
00251         default:
00252             Q_ASSERT_X( 0, "createGraphPattern", "unsupported Term type" );
00253         }
00254 
00255         return QString();
00256     }
00257 }
00258 
00259 
00260 Nepomuk::Search::SearchThread::SearchThread( QObject* parent )
00261     : QThread( parent )
00262 {
00263 }
00264 
00265 
00266 Nepomuk::Search::SearchThread::~SearchThread()
00267 {
00268 }
00269 
00270 
00271 void Nepomuk::Search::SearchThread::query( const Query& term, double cutOffScore )
00272 {
00273     if( isRunning() ) {
00274         cancel();
00275     }
00276 
00277     kDebug() << term << cutOffScore;
00278 
00279     m_canceled = false;
00280     m_searchTerm = term;
00281     m_cutOffScore = cutOffScore;
00282     m_numResults = 0;
00283 
00284     start();
00285 }
00286 
00287 
00288 void Nepomuk::Search::SearchThread::cancel()
00289 {
00290     m_canceled = true;
00291     wait();
00292 }
00293 
00294 
00295 void Nepomuk::Search::SearchThread::run()
00296 {
00297     QTime time;
00298     time.start();
00299 
00300     if ( m_searchTerm.type() == Query::PlainQuery ) {
00301         kDebug() << "Plain Query:    " << m_searchTerm;
00302         Term t = resolveFields( m_searchTerm.term() );
00303         kDebug() << "Fields resolved:" << t;
00304         t = resolveValues( t );
00305         kDebug() << "Values resolved:" << t;
00306         t = optimize( t );
00307         kDebug() << "Optimized query:" << t;
00308 
00309         search( splitLuceneSparql( t ) /*optimize( resolveValues( resolveFields( m_searchTerm ) ) )*/, 1.0, true );
00310     }
00311     else {
00312         // FIXME: once we have the Soprano query API it should be simple to add the requestProperties here
00313         // for now we do it the hacky way
00314         QString query = m_searchTerm.sparqlQuery();
00315         int pos = query.indexOf( QLatin1String( "where" ) );
00316         if ( pos > 0 ) {
00317             query.insert( pos, buildRequestPropertyVariableList() + ' ' );
00318             pos = query.lastIndexOf( '}' );
00319             if ( pos > 0 ) {
00320                 query.insert( pos, ' ' + buildRequestPropertyPatterns() + ' ' );
00321             }
00322         }
00323 
00324         sparqlQuery( query, 1.0, true );
00325     }
00326 
00327     kDebug() << time.elapsed();
00328 }
00329 
00330 
00331 Nepomuk::Search::Term Nepomuk::Search::SearchThread::resolveFields( const Term& term )
00332 {
00333     switch( term.type() ) {
00334     case Term::AndTerm:
00335     case Term::OrTerm: {
00336         Term newTerm;
00337         newTerm.setType( term.type() );
00338         QList<Term> terms = term.subTerms();
00339         foreach( const Term& t, terms ) {
00340             if ( m_canceled ) break;
00341             newTerm.addSubTerm( resolveFields( t ) );
00342         }
00343         return newTerm;
00344     }
00345 
00346 
00347     case Term::ComparisonTerm: {
00348         Term newTerm( term );
00349         Term subTerm = term.subTerms().first();
00350         if ( subTerm.type() != Term::LiteralTerm &&
00351              subTerm.type() != Term::ResourceTerm ) {
00352             newTerm.setSubTerms( QList<Term>() << resolveFields( subTerm ) );
00353         }
00354 
00355         if ( !newTerm.property().isValid() ) {
00356             // FIXME: use the score of the field search as boost factors
00357             QList<QUrl> properties = matchFieldName( term.field() );
00358             if ( properties.count() > 0 ) {
00359                 if ( properties.count() == 1 ) {
00360                     newTerm.setProperty( properties.first() );
00361                     return newTerm;
00362                 }
00363                 else {
00364                     Term orTerm;
00365                     orTerm.setType( Term::OrTerm );
00366                     foreach( const QUrl& property, properties ) {
00367                         Term t( newTerm );
00368                         t.setProperty( property );
00369                         orTerm.addSubTerm( t );
00370                     }
00371                     return orTerm;
00372                 }
00373             }
00374             else {
00375                 kDebug() << "Failed to resolve field" << term.field() << "to any property!";
00376                 return Term();
00377             }
00378         }
00379     }
00380 
00381     default:
00382         return term;
00383     }
00384 }
00385 
00386 
00387 // precondition: resolveFields needs to be run before this one as it only touches properties
00388 Nepomuk::Search::Term Nepomuk::Search::SearchThread::resolveValues( const Term& term )
00389 {
00390     switch( term.type() ) {
00391     case Term::AndTerm:
00392     case Term::OrTerm: {
00393         Term newTerm;
00394         newTerm.setType( term.type() );
00395         QList<Term> terms = term.subTerms();
00396         foreach( const Term& t, terms ) {
00397             if ( m_canceled ) break;
00398             newTerm.addSubTerm( resolveValues( t ) );
00399         }
00400         return newTerm;
00401     }
00402 
00403 
00404     case Term::ComparisonTerm: {
00405         // FIXME: we could also handle this via lucene for literals but what is better?
00406         // with lucene we have the additional work of getting the requestProperties
00407 
00408         // FIXME: handle subqueries
00409 
00410         //
00411         // ComparisonTerm Terms can contain subterms that again. We do not support
00412         // arbitrary subterms but only comparator terms. Here we will only resolve the
00413         // last one since all others will be handled in a single SPARQL query.
00414         //
00415         // Also, non-comtains comparators are handled in the SPARQL query as well.
00416         //
00417         // Thus, in the end we only resolve literal contains terms.
00418         //
00419         if ( term.comparator() == Term::Contains &&
00420              term.subTerms().first().type() == Term::LiteralTerm ) {
00421 
00422             Q_ASSERT ( term.property().isValid() );
00423 
00424             // we only need to augment terms that have a property with
00425             // a non-literal range. These will never hit in a lucene query
00426             // anyway
00427             Nepomuk::Types::Property prop( term.property() );
00428             if ( prop.range().isValid() ) {
00429 
00430                 Term orTerm;
00431                 orTerm.setType( Term::OrTerm );
00432 
00433                 // FIXME: cache the results as it is very well possible that we search the same multiple times
00434                 // if resolveFields did create an OR term
00435 
00436                 // rdfs:label has a higher priority than any other property
00437                 // TODO: without being able to query the resource type simple searching for term.value() is waaaaay to slow
00438                 //QString query = QString( "%1:\"%2\"^4 \"%2\"" )
00439                 QString query = QString( "%1:\"%2\" OR %3:\"%2\" OR %4:\"%2\"" )
00440                                 .arg( luceneQueryEscape( Soprano::Vocabulary::RDFS::label() ) )
00441                                 .arg( term.subTerms().first().value().toString() )
00442                                 .arg( luceneQueryEscape( Soprano::Vocabulary::NAO::prefLabel() ) )
00443                                 .arg( luceneQueryEscape( Soprano::Vocabulary::NAO::identifier() ) );
00444                 Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( query,
00445                                                                                                             Soprano::Query::QueryLanguageUser,
00446                                                                                                             "lucene" );
00447 
00448                 while ( hits.next() ) {
00449                     if ( m_canceled ) break;
00450 
00451                     // FIXME: use the lucene score as boost factor
00452                     QUrl hit = hits.binding( 0 ).uri();
00453                     if ( prop.range().uri() == Soprano::Vocabulary::RDFS::Resource() ||
00454                          Nepomuk::Resource( hit ).hasType( prop.range().uri() ) ) {
00455                         orTerm.addSubTerm( Term( term.property(), hit ) );
00456                         if ( orTerm.subTerms().count() == MAX_RESOURCES ) {
00457                             break;
00458                         }
00459                     }
00460                 }
00461 
00462                 if ( orTerm.subTerms().count() == 1 ) {
00463                     return orTerm.subTerms().first();
00464                 }
00465                 else if ( orTerm.subTerms().count() ) {
00466                     return orTerm;
00467                 }
00468                 else {
00469                     kDebug() << "Failed to match value" << term.subTerms().first().value() << "to any possible resource.";
00470                     return term;
00471                 }
00472             }
00473             else {
00474                 // nothing to do here
00475                 return term;
00476             }
00477         }
00478 
00479         // non-literal term or non-contains term -> handled in SPARQL query
00480         else {
00481             Term newTerm( term );
00482             newTerm.setSubTerms( QList<Term>() << resolveValues( term.subTerms().first() ) );
00483             return newTerm;
00484         }
00485     }
00486 
00487     default:
00488         return term;
00489     }
00490 }
00491 
00492 
00493 Nepomuk::Search::Term Nepomuk::Search::SearchThread::optimize( const Term& term )
00494 {
00495     switch( term.type() ) {
00496     case Term::AndTerm:
00497     case Term::OrTerm: {
00498         QList<Term> subTerms = term.subTerms();
00499         QList<Term> newSubTerms;
00500         QList<Term>::const_iterator end( subTerms.constEnd() );
00501         for ( QList<Term>::const_iterator it = subTerms.constBegin();
00502               it != end; ++it ) {
00503             const Term& t = *it;
00504             Term ot = optimize( t );
00505             if ( ot.type() == term.type() ) {
00506                 newSubTerms += ot.subTerms();
00507             }
00508             else {
00509                 newSubTerms += ot;
00510             }
00511         }
00512         Term newTerm;
00513         newTerm.setType( term.type() );
00514         newTerm.setSubTerms( newSubTerms );
00515         return newTerm;
00516     }
00517 
00518     default:
00519         return term;
00520     }
00521 }
00522 
00523 
00524 Nepomuk::Search::SearchNode Nepomuk::Search::SearchThread::splitLuceneSparql( const Term& term )
00525 {
00526     // Goal: separate the terms into 2 groups: literal and resource which are
00527     // merged with only one AND or OR action. Is that possible?
00528 
00529     // For now we will do this (our query lang does not handle nested queries anyway)
00530     // LiteralTerm    -> one lucene, no sparql
00531     // ComparisonTerm -> one lucene, no sparql (resource contains will be resolved to equality above)
00532     // AndTerm        -> divide all subterms and create two "small" AND terms
00533     // OrTerm         -> divide all subterms and create two "small" OR terms
00534 
00535     switch( term.type() ) {
00536     case Term::LiteralTerm:
00537         return SearchNode( term, SearchNode::Lucene );
00538 
00539     case Term::ComparisonTerm:
00540         if ( term.comparator() == Term::Contains &&
00541              term.subTerms().first().type() == Term::LiteralTerm ) {
00542             // no need for subnides here - we only use the subterm's value
00543             return SearchNode( term, SearchNode::Lucene );
00544         }
00545         else {
00546             // all subnodes are resolved and can be handled in a SPARQL query
00547             SearchNode node( term, SearchNode::Sparql );
00548             node.subNodes += splitLuceneSparql( term.subTerms().first() );
00549             return node;
00550         }
00551 
00552     case Term::AndTerm:
00553     case Term::OrTerm: {
00554         QList<Term> subTerms = term.subTerms();
00555         QList<SearchNode> luceneNodes, sparqlNodes, unknownNodes;
00556 
00557         QList<Term>::const_iterator end( subTerms.constEnd() );
00558         for ( QList<Term>::const_iterator it = subTerms.constBegin();
00559               it != end; ++it ) {
00560             SearchNode node = splitLuceneSparql( *it );
00561             if ( node.type == SearchNode::Lucene ) {
00562                 luceneNodes += node;
00563             }
00564             else if ( node.type == SearchNode::Sparql ) {
00565                 sparqlNodes += node;
00566             }
00567             else {
00568                 unknownNodes += node;
00569             }
00570         }
00571 
00572         if ( luceneNodes.count() && !sparqlNodes.count() && !unknownNodes.count() ) {
00573             return SearchNode( term, SearchNode::Lucene, luceneNodes );
00574         }
00575         else if ( !luceneNodes.count() && sparqlNodes.count() && !unknownNodes.count() ) {
00576             return SearchNode( term, SearchNode::Sparql, sparqlNodes );
00577         }
00578         else if ( !luceneNodes.count() && !sparqlNodes.count() && unknownNodes.count() ) {
00579             return SearchNode( term, SearchNode::Unknown, unknownNodes );
00580         }
00581         else {
00582             Term newTerm;
00583             newTerm.setType( term.type() );
00584             SearchNode andNode( newTerm );
00585             if ( luceneNodes.count() )
00586                 andNode.subNodes += SearchNode( term, SearchNode::Lucene, luceneNodes );
00587             if ( sparqlNodes.count() )
00588                 andNode.subNodes += SearchNode( term, SearchNode::Sparql, sparqlNodes );
00589             if ( unknownNodes.count() )
00590                 andNode.subNodes += SearchNode( term, SearchNode::Unknown, unknownNodes );
00591             return andNode;
00592         }
00593     }
00594 
00595     default:
00596 //        Q_ASSERT_X( 0, "splitLuceneSparql", "invalid term" );
00597         return SearchNode( Term() );
00598     }
00599 }
00600 
00601 
00602 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::search( const SearchNode& node, double baseScore, bool reportResults )
00603 {
00604     if ( node.type == SearchNode::Lucene ) {
00605         return luceneQuery( createLuceneQuery( node ), baseScore, reportResults );
00606     }
00607     else if ( node.type == SearchNode::Sparql ) {
00608         return sparqlQuery( createSparqlQuery( node ), baseScore, reportResults );
00609     }
00610     else if ( node.term.type() == Term::AndTerm ) {
00611         return andSearch( node.subNodes, baseScore, reportResults );
00612     }
00613     else {
00614         return orSearch( node.subNodes, baseScore, reportResults );
00615     }
00616 }
00617 
00618 
00619 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::andSearch( const QList<SearchNode>& nodes, double baseScore, bool reportResults )
00620 {
00621     QHash<QUrl, Result> results;
00622     bool first = true;
00623     foreach( const SearchNode& node, nodes ) {
00624         if ( m_canceled ) break;
00625         // FIXME: the search will restrict the number of results to maxResults although
00626         //        after the merge we might have less
00627         QHash<QUrl, Result> termResults = search( node, baseScore, false );
00628         if ( first ) {
00629             results = termResults;
00630             first = false;
00631         }
00632         else {
00633             // intersect the results
00634             // FIXME: sort by score
00635             QHash<QUrl, Result>::iterator it = results.begin();
00636             while ( it != results.end() ) {
00637                 if ( m_canceled ) break;
00638                 QHash<QUrl, Result>::const_iterator termIt = termResults.constFind( it.key() );
00639                 if ( termIt != termResults.constEnd() ) {
00640                     // update score
00641                     it.value().setScore( it.value().score() + termIt.value().score() );
00642                     ++it;
00643                 }
00644                 else {
00645                     it = results.erase( it );
00646                 }
00647             }
00648         }
00649     }
00650 
00651     if ( reportResults ) {
00652         for ( QHash<QUrl, Result>::const_iterator it = results.constBegin();
00653               it != results.constEnd(); ++it ) {
00654             if ( m_canceled ) break;
00655             if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
00656                 return results;
00657             }
00658             else {
00659                 ++m_numResults;
00660                 emit newResult( it.value() );
00661             }
00662         }
00663     }
00664 
00665     return results;
00666 }
00667 
00668 
00669 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::orSearch( const QList<SearchNode>& nodes, double baseScore, bool reportResults )
00670 {
00671     QHash<QUrl, Result> results;
00672     foreach( const SearchNode& node, nodes ) {
00673         if ( m_canceled ) break;
00674         // FIXME: sort by score, ie. use the maxResults results with the highest score
00675         mergeInResults( results, search( node, baseScore, reportResults ) );
00676     }
00677     if ( reportResults ) {
00678         for ( QHash<QUrl, Result>::const_iterator it = results.constBegin();
00679               it != results.constEnd(); ++it ) {
00680             if ( m_canceled ) break;
00681             if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
00682                 return results;
00683             }
00684             else {
00685                 ++m_numResults;
00686                 emit newResult( it.value() );
00687             }
00688         }
00689     }
00690     return results;
00691 }
00692 
00693 
00694 QList<QUrl> Nepomuk::Search::SearchThread::matchFieldName( const QString& field )
00695 {
00696     kDebug() << field;
00697 
00698     QList<QUrl> results;
00699 
00700     // Step 1: see if we have a direct match to a predicate label
00701     //         there is no need in selecting unused properties
00702     QString query = QString( "select distinct ?p where { "
00703                              "?p <%1> <%2> . "
00704                              "?p <%3> \"%4\"^^<%5> . "
00705                              "?x ?p ?y . }" )
00706                     .arg( Soprano::Vocabulary::RDF::type().toString() )
00707                     .arg( Soprano::Vocabulary::RDF::Property().toString() )
00708                     .arg( Soprano::Vocabulary::RDFS::label().toString() )
00709                     .arg( field )
00710                     .arg( Soprano::Vocabulary::XMLSchema::string().toString() );
00711     kDebug() << "Direct match query:" << query;
00712 
00713     Soprano::QueryResultIterator labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
00714                                                                                                      Soprano::Query::QueryLanguageSparql );
00715     if ( !m_canceled ) {
00716         while ( labelHits.next() ) {
00717             results << labelHits.binding( "p" ).uri();
00718             kDebug() << "Found direct match" << labelHits.binding( "p" ).uri();
00719         }
00720 
00721         if ( results.isEmpty() ) {
00722             // FIXME: how about we have two repositories: one for the ontologies and one for the data.
00723             //        I don't think there will be relations between the RDF or Xesam ontology and some
00724             //        metadata....
00725             //        Because then queries like the one we are doing here will be more performant since
00726             //        we do not search the data itself and do not have to filter
00727             // BUT: What about inference?
00728 
00729             query = QString( "select ?p where { "
00730                              "?p <%1> <%2> . "
00731                              "?p <%3> ?label . "
00732                              "FILTER(REGEX(STR(?label),'%4','i')) . }" )
00733                     .arg( Soprano::Vocabulary::RDF::type().toString() )
00734                     .arg( Soprano::Vocabulary::RDF::Property().toString() )
00735                     .arg( Soprano::Vocabulary::RDFS::label().toString() )
00736                     .arg( field );
00737             kDebug() << "Indirect hit query:" << query;
00738             labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
00739                                                                                 Soprano::Query::QueryLanguageSparql );
00740             QString newQuery;
00741             while ( labelHits.next() ) {
00742                 results << labelHits.binding( "p" ).uri();
00743                 kDebug() << "Found indirect match by label" << labelHits.binding( "p" ).uri();
00744             }
00745         }
00746 
00747 
00748         if ( results.isEmpty() ) {
00749             query = QString( "select ?p where { "
00750                              "?p <%1> <%2> . "
00751                              "FILTER(REGEX(STR(?p),'%3','i')) . }" )
00752                     .arg( Soprano::Vocabulary::RDF::type().toString() )
00753                     .arg( Soprano::Vocabulary::RDF::Property().toString() )
00754                     .arg( field );
00755             kDebug() << "Indirect hit query:" << query;
00756             labelHits = ResourceManager::instance()->mainModel()->executeQuery( query,
00757                                                                                 Soprano::Query::QueryLanguageSparql );
00758             QString newQuery;
00759             while ( labelHits.next() ) {
00760                 results << labelHits.binding( "p" ).uri();
00761                 kDebug() << "Found indirect match by name" << labelHits.binding( "p" ).uri();
00762             }
00763         }
00764     }
00765 
00766     return results;
00767 }
00768 
00769 
00770 QString Nepomuk::Search::SearchThread::createSparqlQuery( const Nepomuk::Search::SearchNode& node )
00771 {
00772     int varCnt = 0;
00773     return QString( "select distinct ?r %1 where { graph ?g { ?r a ?type . } . ?g a <%2> . %3 %4 }" )
00774         .arg( buildRequestPropertyVariableList() )
00775         .arg( Soprano::Vocabulary::NRL::InstanceBase().toString() )
00776         .arg( createGraphPattern( node, varCnt ) )
00777         .arg( buildRequestPropertyPatterns() );
00778 }
00779 
00780 
00781 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::sparqlQuery( const QString& query, double baseScore, bool reportResults )
00782 {
00783     kDebug() << query;
00784 
00785     QHash<QUrl, Result> results;
00786 
00787     Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( query, Soprano::Query::QueryLanguageSparql );
00788     while ( hits.next() ) {
00789         if ( m_canceled ) break;
00790 
00791         Result result = extractResult( hits );
00792         result.setScore( baseScore );
00793 
00794         kDebug() << "Found result:" << result.resourceUri();
00795 
00796         // these are actual direct hits and we can report them right away
00797         if ( reportResults ) {
00798             if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
00799                 return results;
00800             }
00801             else {
00802                 ++m_numResults;
00803                 emit newResult( result );
00804             }
00805         }
00806 
00807         results.insert( result.resourceUri(), result );
00808     }
00809 
00810     return results;
00811 }
00812 
00813 
00814 QHash<QUrl, Nepomuk::Search::Result> Nepomuk::Search::SearchThread::luceneQuery( const QString& query, double baseScore, bool reportResults )
00815 {
00816     QString finalQuery( query );
00817 
00818     // if Soprano is 2.1.64 or newer the storage service does force the indexing or rdf:type which means that
00819     // we can query it via lucene queries
00820     // normally for completeness we would have to exclude all the owl and nrl properties but that would make
00821     // for way to long queries and this should cover most cases anyway
00822     // since we do not have inference we even need to check subclasses
00823 #if SOPRANO_IS_VERSION(2,1,64)
00824     finalQuery += QString(" AND NOT %1:%2 AND NOT %1:%3 AND NOT %1:%4 AND NOT %1:%5 AND NOT %1:%6 AND NOT %1:%7")
00825                   .arg( luceneQueryEscape(Soprano::Vocabulary::RDF::type()) )
00826                   .arg( luceneQueryEscape(Soprano::Vocabulary::RDF::Property()) )
00827                   .arg( luceneQueryEscape(Soprano::Vocabulary::RDFS::Class()) )
00828                   .arg( luceneQueryEscape(Soprano::Vocabulary::OWL::Class()) )
00829                   .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::InstanceBase()) )
00830                   .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::Ontology()) )
00831                   .arg( luceneQueryEscape(Soprano::Vocabulary::NRL::KnowledgeBase()) );
00832 #endif
00833 
00834     kDebug() << finalQuery;
00835 
00836     Soprano::QueryResultIterator hits = ResourceManager::instance()->mainModel()->executeQuery( finalQuery,
00837                                                                                                 Soprano::Query::QueryLanguageUser,
00838                                                                                                 "lucene" );
00839     QHash<QUrl, Result> results;
00840 
00841     while ( hits.next() ) {
00842         if ( m_canceled ) break;
00843 
00844         QUrl hitUri = hits.binding( 0 ).uri();
00845         double hitScore = hits.binding( 1 ).literal().toDouble() * baseScore;
00846 
00847         if ( hitScore >= cutOffScore() ) {
00848             Result result( hitUri, hitScore );
00849 
00850             if ( !m_searchTerm.requestProperties().isEmpty() ) {
00851                 // FIXME: when merging with results from sparqlQuery there is no need to fetch them twice!
00852                 fetchRequestPropertiesForResource( result );
00853             }
00854 
00855             // these are actual direct hits and we can report them right away
00856             if ( reportResults ) {
00857                 if ( m_searchTerm.limit() > 0 && m_numResults >= m_searchTerm.limit() ) {
00858                     return results;
00859                 }
00860                 else {
00861                     ++m_numResults;
00862                     kDebug() << "direct hit:" << hitUri << hitScore;
00863                     emit newResult( result );
00864                 }
00865             }
00866 
00867             results.insert( hitUri, result );
00868         }
00869         else {
00870             kDebug() << "Score too low:" << hitUri << hitScore;
00871         }
00872     }
00873 
00874     return results;
00875 }
00876 
00877 
00878 QString Nepomuk::Search::SearchThread::buildRequestPropertyVariableList() const
00879 {
00880     int numRequestProperties = m_searchTerm.requestProperties().count();
00881     QString s;
00882     for ( int i = 1; i <= numRequestProperties; ++i ) {
00883         s += QString( "?reqProp%1 " ).arg( i );
00884     }
00885     return s;
00886 }
00887 
00888 
00889 QString Nepomuk::Search::SearchThread::buildRequestPropertyPatterns() const
00890 {
00891     QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
00892     QString s;
00893     int i = 1;
00894     foreach ( const Query::RequestProperty& rp, requestProperties ) {
00895         if ( rp.second ) {
00896             s += "OPTIONAL { ";
00897         }
00898 
00899         s += QString( "?r <%1> ?reqProp%2 . " ).arg( QString::fromAscii( rp.first.toEncoded() ) ).arg( i++ );
00900 
00901         if ( rp.second ) {
00902             s += "} ";
00903         }
00904     }
00905     return s;
00906 }
00907 
00908 
00909 Nepomuk::Search::Result Nepomuk::Search::SearchThread::extractResult( const Soprano::QueryResultIterator& it ) const
00910 {
00911     Result result( it.binding( 0 ).uri() );
00912 
00913     int i = 1;
00914     QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
00915     foreach ( const Query::RequestProperty& rp, requestProperties ) {
00916         result.addRequestProperty( rp.first, it.binding( QString("reqProp%1").arg( i++ ) ) );
00917     }
00918 
00919     // score will be set above
00920     return result;
00921 }
00922 
00923 
00924 void Nepomuk::Search::SearchThread::fetchRequestPropertiesForResource( Result& result )
00925 {
00926     QString q = QString( "select distinct %1 where { %2 }" )
00927                 .arg( buildRequestPropertyVariableList() )
00928                 .arg( buildRequestPropertyPatterns().replace( "?r ", '<' + QString::fromAscii( result.resourceUri().toEncoded() ) + "> " ) );
00929     kDebug() << q;
00930     Soprano::QueryResultIterator reqPropHits = ResourceManager::instance()->mainModel()->executeQuery( q, Soprano::Query::QueryLanguageSparql );
00931     if ( reqPropHits.next() ) {
00932         int i = 1;
00933         QList<Query::RequestProperty> requestProperties = m_searchTerm.requestProperties();
00934         foreach ( const Query::RequestProperty& rp, requestProperties ) {
00935             result.addRequestProperty( rp.first, reqPropHits.binding( QString("reqProp%1").arg( i++ ) ) );
00936         }
00937     }
00938 }
00939 
00940 #include "searchthread.moc"

NepomukDaemons

Skip menu "NepomukDaemons"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

API Reference

Skip menu "API Reference"
  • KCMShell
  • KNotify
  • KStyles
  • Nepomuk Daemons
Generated for API Reference by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal