00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "sopranoindexreader.h"
00021 #include "tstring.h"
00022 #include <strigi/query.h>
00023 #include <strigi/queryparser.h>
00024 #include <strigi/fieldtypes.h>
00025 #include "util.h"
00026
00027 #include <Soprano/Soprano>
00028 #include <Soprano/Index/IndexFilterModel>
00029 #include <Soprano/Index/CLuceneIndex>
00030 #include <Soprano/Vocabulary/XMLSchema>
00031
00032 #include <map>
00033 #include <utility>
00034 #include <sstream>
00035
00036 #include <CLucene.h>
00037
00038 #include <QtCore/QThread>
00039 #include <QtCore/QDateTime>
00040 #include <QtCore/QDebug>
00041 #include <QtCore/QString>
00042 #include <QtCore/QLatin1String>
00043 #include <QtCore/QFile>
00044
00045
00046 using namespace Soprano;
00047
00048
00049 static lucene::index::Term* createWildCardTerm( const TString& name,
00050 const string& value );
00051 static lucene::index::Term* createTerm( const TString& name,
00052 const string& value );
00053 static lucene::index::Term* createKeywordTerm( const TString& name,
00054 const string& value );
00055 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query );
00056 static lucene::search::Query* createQuery( const Strigi::Query& query );
00057 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query );
00058 static lucene::search::Query* createSingleFieldQuery( const string& field,
00059 const Strigi::Query& query );
00060 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query );
00061
#if 0
// Currently compiled out.
// Returns a copy of s in which every character matched by the character
// class below is replaced by a backslash followed by that character.
static QString luceneQueryEscape( const QString& s )
{
    // Built once: a single capture group matching one special character.
    static QRegExp specialChars( "([\\-" + QRegExp::escape( "+&|!(){}[]^\"~:\\" ) + "])" );

    QString escaped( s );
    // Replacement text is a literal backslash followed by the \1 back-reference.
    escaped.replace( specialChars, "\\\\1" );
    return escaped;
}
#endif
00073
00074 static lucene::index::Term* createWildCardTerm( const TString& name,
00075 const string& value )
00076 {
00077 TString v = TString::fromUtf8( value.c_str() );
00078 return _CLNEW lucene::index::Term( name.data(), v.data() );
00079 }
00080
00081 static lucene::index::Term* createTerm( const TString& name,
00082 const string& value )
00083 {
00084 qDebug() << "createTerm" << name << value.c_str();
00085
00086 TString v = TString::fromUtf8( value.c_str() );
00087
00088 lucene::util::StringReader sr( v.data() );
00089 lucene::analysis::standard::StandardAnalyzer a;
00090 lucene::analysis::TokenStream* ts = a.tokenStream(name.data(), &sr);
00091 lucene::analysis::Token* to = ts->next();
00092 const wchar_t *tv;
00093 if (to) {
00094 tv = to->termText();
00095 } else {
00096 tv = v.data();
00097 }
00098 lucene::index::Term* t = _CLNEW lucene::index::Term(name.data(), tv);
00099 if (to) {
00100 _CLDELETE(to);
00101 }
00102 _CLDELETE(ts);
00103 return t;
00104 }
00105
00106 static lucene::index::Term* createKeywordTerm( const TString& name,
00107 const string& value )
00108 {
00109 TString v = TString::fromUtf8( value.c_str() );
00110 lucene::index::Term* t = _CLNEW lucene::index::Term( name.data(), v.data() );
00111 return t;
00112 }
00113
00114 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query )
00115 {
00116 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00117 bool isAnd = query.type() == Strigi::Query::And;
00118 const vector<Strigi::Query>& sub = query.subQueries();
00119 for (vector<Strigi::Query>::const_iterator i = sub.begin(); i != sub.end(); ++i) {
00120 lucene::search::Query* q = createQuery(*i);
00121 bq->add(q, true, isAnd, i->negate());
00122 }
00123 return bq;
00124 }
00125
00126 static lucene::search::Query* createQuery( const Strigi::Query& query )
00127 {
00128 return query.subQueries().size()
00129 ? createBooleanQuery(query)
00130 : createSimpleQuery(query);
00131 }
00132
00133 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query )
00134 {
00135 switch (query.fields().size()) {
00136 case 0: return createSingleFieldQuery("text", query);
00137 case 1: return createSingleFieldQuery(query.fields()[0], query);
00138 default: return createMultiFieldQuery(query);
00139 }
00140 }
00141
00142 static lucene::search::Query* createSingleFieldQuery( const string& field,
00143 const Strigi::Query& query ) {
00144 qDebug() << "Creating single field query: " << field.c_str();
00145 TString fieldname = Strigi::Soprano::Util::convertSearchField( field );
00146 lucene::search::Query* q;
00147 lucene::index::Term* t;
00148 const string& val = query.term().string();
00149 switch (query.type()) {
00150 case Strigi::Query::LessThan:
00151 t = createTerm(fieldname, val.c_str());
00152 q = _CLNEW lucene::search::RangeQuery(0, t, false);
00153 break;
00154 case Strigi::Query::LessThanEquals:
00155 t = createTerm(fieldname, query.term().string());
00156 q = _CLNEW lucene::search::RangeQuery(0, t, true);
00157 break;
00158 case Strigi::Query::GreaterThan:
00159 t = createTerm(fieldname, query.term().string());
00160 q = _CLNEW lucene::search::RangeQuery(t, 0, false);
00161 break;
00162 case Strigi::Query::GreaterThanEquals:
00163 t = createTerm(fieldname, query.term().string());
00164 q = _CLNEW lucene::search::RangeQuery(t, 0, true);
00165 break;
00166 case Strigi::Query::Keyword:
00167 t = createKeywordTerm(fieldname, query.term().string());
00168 q = _CLNEW lucene::search::TermQuery(t);
00169 break;
00170 default:
00171 if (strpbrk(val.c_str(), "*?")) {
00172 t = createWildCardTerm(fieldname, val);
00173 q = _CLNEW lucene::search::WildcardQuery(t);
00174 } else {
00175 t = createTerm(fieldname, val);
00176 q = _CLNEW lucene::search::TermQuery(t);
00177 }
00178 }
00179 _CLDECDELETE(t);
00180 return q;
00181 }
00182
00183 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query )
00184 {
00185 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00186 for (vector<string>::const_iterator i = query.fields().begin();
00187 i != query.fields().end(); ++i) {
00188 lucene::search::Query* q = createSingleFieldQuery(*i, query);
00189 bq->add(q, true, false, false);
00190 }
00191 return bq;
00192 }
00193
00194
00195 static QString escapeLiteralForSparqlQuery( const QString& s )
00196 {
00197 return QString( s ).replace( '\\', "\\\\" ).replace( '\"', "\\\"" );
00198 }
00199
00200
00201 class Strigi::Soprano::IndexReader::Private
00202 {
00203 public:
00204 bool createDocument( const Node& res, IndexedDocument& doc ) {
00205 StatementIterator it = repository->listStatements( Statement( res, Node(), Node() ) );
00206 if ( it.lastError() ) {
00207 return false;
00208 }
00209
00210
00211 doc.uri = res.uri().toLocalFile().toUtf8().data();
00212
00213 while ( it.next() ) {
00214 Statement s = *it;
00215 if ( s.object().isLiteral() ) {
00216 std::string fieldName = Util::fieldName( s.predicate().uri() );
00217 std::string value = s.object().toString().toUtf8().data();
00218
00219 if (fieldName == "text") {
00220 doc.fragment = value;
00221 }
00222 else if (fieldName == FieldRegister::pathFieldName) {
00223 qDebug() << "Setting IndexedDocument uri=" << value.c_str();
00224 doc.uri = value;
00225 }
00226 else if (fieldName == FieldRegister::mimetypeFieldName) {
00227 doc.mimetype = value;
00228 }
00229 else if (fieldName == FieldRegister::mtimeFieldName) {
00230
00231 if ( s.object().literal().isDateTime() ) {
00232 doc.mtime = s.object().literal().toDateTime().toTime_t();
00233 }
00234 else {
00235 doc.mtime = s.object().literal().toUnsignedInt();
00236 }
00237 }
00238 else if (fieldName == FieldRegister::sizeFieldName) {
00239 doc.size = s.object().literal().toInt64();
00240 }
00241 else {
00242 doc.properties.insert( make_pair<const string, string>( fieldName, value ) );
00243 }
00244 }
00245 else {
00246
00247
00248 }
00249 }
00250
00251 return true;
00252 }
00253
00254
00255 ::Soprano::Model* repository;
00256 };
00257
00258
00259 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model* model )
00260 : Strigi::IndexReader()
00261 {
00262 qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
00263 d = new Private;
00264 d->repository = model;
00265 }
00266
00267
00268 Strigi::Soprano::IndexReader::~IndexReader()
00269 {
00270 qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
00271 delete d;
00272 }
00273
00274
00275 int32_t Strigi::Soprano::IndexReader::countHits( const Query& query )
00276 {
00277 qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
00278
00279 lucene::search::Query* q = createQuery( query );
00280 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( q->toString(), true ),
00281 ::Soprano::Query::QueryLanguageUser,
00282 QLatin1String( "lucene" ) );
00283
00284 int s = 0;
00285 while ( hits.next() ) {
00286 qDebug() << "Query hit:" << hits.binding( 0 );
00287 ++s;
00288 }
00289 _CLDELETE(q);
00290 return s;
00291 }
00292
00293
00294 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query& query,
00295 const std::vector<std::string>& fields,
00296 const std::vector<Strigi::Variant::Type>& types,
00297 std::vector<std::vector<Strigi::Variant> >& result,
00298 int off, int max )
00299 {
00300 qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
00301 lucene::search::Query* bq = createQuery( query );
00302 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00303 ::Soprano::Query::QueryLanguageUser,
00304 QLatin1String( "lucene" ) );
00305
00306
00307 int i = -1;
00308 while ( hits.next() ) {
00309 ++i;
00310 if ( i < off ) {
00311 continue;
00312 }
00313 if ( i > max ) {
00314 break;
00315 }
00316
00317
00318 std::vector<Strigi::Variant> resultRow;
00319 std::vector<std::string>::const_iterator fieldIt = fields.begin();
00320 std::vector<Strigi::Variant::Type>::const_iterator typesIt = types.begin();
00321 while ( fieldIt != fields.end() ) {
00322 if ( typesIt == types.end() ) {
00323 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
00324 return;
00325 }
00326
00327 StatementIterator it = d->repository->listStatements( Statement( hits.binding( "resource" ),
00328 Util::fieldUri( *fieldIt ),
00329 Node() ) );
00330
00331 if ( it.next() ) {
00332 resultRow.push_back( Util::nodeToVariant( it.current().object() ) );
00333 }
00334 else {
00335 resultRow.push_back( Strigi::Variant() );
00336 }
00337
00338 ++fieldIt;
00339 ++typesIt;
00340 }
00341
00342 result.push_back( resultRow );
00343 }
00344 _CLDELETE(bq);
00345 }
00346
00347
00348 std::vector<Strigi::IndexedDocument> Strigi::Soprano::IndexReader::query( const Query& query, int off, int max )
00349 {
00350 qDebug() << "IndexReader::query in thread" << QThread::currentThread();
00351 vector<IndexedDocument> results;
00352 lucene::search::Query* bq = createQuery( query );
00353 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00354 ::Soprano::Query::QueryLanguageUser,
00355 QLatin1String( "lucene" ) );
00356
00357
00358 int i = -1;
00359 while ( hits.next() ) {
00360 ++i;
00361 if ( i < off ) {
00362 continue;
00363 }
00364 if ( i > max ) {
00365 break;
00366 }
00367
00368 IndexedDocument result;
00369
00370 result.score = hits.binding( 1 ).literal().toDouble();
00371 if ( d->createDocument( hits.binding( 0 ), result ) ) {
00372 results.push_back( result );
00373 }
00374 else {
00375 qDebug() << "Failed to create indexed document for resource " << hits.binding( 0 ) << ": " << d->repository->lastError();
00376 }
00377 }
00378 _CLDELETE(bq);
00379 return results;
00380 }
00381
00382
00383
00384 void Strigi::Soprano::IndexReader::getChildren( const std::string& parent,
00385 std::map<std::string, time_t>& children )
00386 {
00387
00388 QString query = QString( "select distinct ?path ?mtime where { "
00389 "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
00390 "?r <%4> ?mtime . "
00391 "?r <%5> ?path . "
00392 "}")
00393 .arg( Util::fieldUri( FieldRegister::parentLocationFieldName ).toString(),
00394 escapeLiteralForSparqlQuery( QString::fromUtf8( parent.c_str() ) ),
00395 Vocabulary::XMLSchema::string().toString(),
00396 Util::fieldUri( FieldRegister::mtimeFieldName ).toString(),
00397 Util::fieldUri( FieldRegister::pathFieldName ).toString(),
00398 Node( QUrl::fromLocalFile( QFile::decodeName( parent.c_str() ) ) ).toN3() );
00399
00400
00401
00402 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
00403
00404 while ( result.next() ) {
00405 Node pathNode = result.binding( "path" );
00406 Node mTimeNode = result.binding( "mtime" );
00407
00408
00409
00410 std::string path;
00411 if ( pathNode.isLiteral() )
00412 path = pathNode.toString().toUtf8().data();
00413 else
00414 path = QFile::encodeName( pathNode.uri().toLocalFile() ).data();
00415
00416
00417 if ( mTimeNode.literal().isDateTime() ) {
00418 children[path] = mTimeNode.literal().toDateTime().toTime_t();
00419 }
00420 else {
00421 children[path] = mTimeNode.literal().toUnsignedInt();
00422 }
00423 }
00424 }
00425
00426
00427 int32_t Strigi::Soprano::IndexReader::countDocuments()
00428 {
00429 qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
00430
00431 return 0;
00432 }
00433
00434
00435 int32_t Strigi::Soprano::IndexReader::countWords()
00436 {
00437 qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
00438
00439 return -1;
00440 }
00441
00442
00443 int64_t Strigi::Soprano::IndexReader::indexSize()
00444 {
00445 qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
00446 return d->repository->statementCount();
00447 }
00448
00449
00450 time_t Strigi::Soprano::IndexReader::mTime( const std::string& uri )
00451 {
00452
00453 QString query = QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
00454 .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString(),
00455 Util::fieldUri( FieldRegister::pathFieldName ).toString(),
00456 escapeLiteralForSparqlQuery( QString::fromUtf8( uri.c_str() ) ),
00457 Vocabulary::XMLSchema::string().toString() );
00458
00459 qDebug() << "mTime( " << uri.c_str() << ") query:" << query;
00460
00461 QueryResultIterator it = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
00462
00463 time_t mtime = 0;
00464 if ( it.next() ) {
00465 ::Soprano::LiteralValue val = it.binding( "mtime" ).literal();
00466
00467
00468 if ( val.isDateTime() ) {
00469 mtime = val.toDateTime().toTime_t();
00470 }
00471 else {
00472 mtime = val.toUnsignedInt();
00473 }
00474 }
00475 return mtime;
00476 }
00477
00478
00479 std::vector<std::string> Strigi::Soprano::IndexReader::fieldNames()
00480 {
00481 qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
00482
00483
00484
00485 std::vector<std::string> fields;
00486 QueryResultIterator it = d->repository->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QueryLanguageSparql );
00487 while ( it.next() ) {
00488 fields.push_back( Util::fieldName( it.binding("p").uri() ) );
00489 }
00490 return fields;
00491 }
00492
00493
00494 std::vector<std::pair<std::string,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string& query,
00495 const std::string& fieldname,
00496 const std::string& labeltype )
00497 {
00498 Q_UNUSED(query);
00499 Q_UNUSED(fieldname);
00500 Q_UNUSED(labeltype);
00501
00502
00503 qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
00504
00505 return std::vector<std::pair<std::string,uint32_t> >();
00506 }
00507
00508
00509 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string& keywordprefix,
00510 const std::vector<std::string>& fieldnames)
00511 {
00512 Q_UNUSED(keywordprefix);
00513 Q_UNUSED(fieldnames);
00514
00515 qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
00516
00517 return 2;
00518 }
00519
00520
00521 std::vector<std::string> Strigi::Soprano::IndexReader::keywords( const std::string& keywordmatch,
00522 const std::vector<std::string>& fieldnames,
00523 uint32_t max, uint32_t offset )
00524 {
00525 Q_UNUSED(keywordmatch);
00526 Q_UNUSED(fieldnames);
00527 Q_UNUSED(max);
00528 Q_UNUSED(offset);
00529
00530 qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
00531
00532 return std::vector<std::string>();
00533 }