• Skip to content
  • Skip to link menu
KDE 4.2 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

NepomukDaemons

sopranoindexwriter.cpp

Go to the documentation of this file.
00001 /*
00002    Copyright (C) 2007-2008 Sebastian Trueg <trueg@kde.org>
00003 
00004    This library is free software; you can redistribute it and/or
00005    modify it under the terms of the GNU General Public License as
00006    published by the Free Software Foundation; either version 2 of
00007    the License, or (at your option) any later version.
00008 
00009    This library is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012    Library General Public License for more details.
00013 
00014    You should have received a copy of the GNU General Public License
00015    along with this library; see the file COPYING.  If not, write to
00016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017    Boston, MA 02110-1301, USA.
00018  */
00019 
00020 #include "sopranoindexwriter.h"
00021 #include "util.h"
00022 
00023 #include <Soprano/Soprano>
00024 #include <Soprano/Vocabulary/RDF>
00025 #include <Soprano/Vocabulary/Xesam>
00026 #include <Soprano/LiteralValue>
00027 
00028 #include <QtCore/QList>
00029 #include <QtCore/QHash>
00030 #include <QtCore/QVariant>
00031 #include <QtCore/QFileInfo>
00032 #include <QtCore/QFile>
00033 #include <QtCore/QUrl>
00034 #include <QtCore/QDebug>
00035 #include <QtCore/QThread>
00036 #include <QtCore/QDateTime>
00037 #include <QtCore/QByteArray>
00038 #include <QtCore/QUuid>
00039 
00040 #include <KUrl>
00041 
00042 #include <sys/stat.h>
00043 #include <stdlib.h>
00044 #include <string.h>
00045 #include <errno.h>
00046 
00047 #include <map>
00048 #include <sstream>
00049 #include <algorithm>
00050 
00051 
00052 // IMPORTANT: strings in Strigi are apparently UTF8! Except for file names. Those are in local encoding.
00053 
00054 using namespace Soprano;
00055 
00056 
00057 uint qHash( const std::string& s )
00058 {
00059     return qHash( s.c_str() );
00060 }
00061 
00062 namespace {
00063     QString findArchivePath( const QString& path ) {
00064         QString p( path );
00065         int i = 0;
00066         while ( ( i = p.lastIndexOf( '/' ) ) > 0 ) {
00067             p.truncate( i );
00068             if ( QFileInfo( p ).isFile() ) {
00069                 return p;
00070             }
00071         }
00072         return QString();
00073     }
00074 
00075     QUrl createResourceUri( const Strigi::AnalysisResult* idx ) {
00076         // HACK: Strigi includes analysers that recurse into tar or zip archives and index
00077         // the files therein. In KDE these files could perfectly be handled through kio slaves
00078         // such as tar:/ or zip:/
00079         // Here we try to use KDE-compatible URIs for these indexed files the best we can
00080         // everything else defaults to file:/
00081         QUrl uri;
00082         QString path = QFile::decodeName( idx->path().c_str() );
00083         if ( KUrl::isRelativeUrl( path ) )
00084             uri = QUrl::fromLocalFile( QFileInfo( path ).absoluteFilePath() );
00085         else
00086             uri = KUrl( path ); // try to support http and other URLs
00087 
00088         if ( idx->depth() > 0 ) {
00089             QString archivePath = findArchivePath( path );
00090             if ( QFile::exists( archivePath ) ) {
00091                 if ( archivePath.endsWith( QLatin1String( ".tar" ) ) ||
00092                      archivePath.endsWith( QLatin1String( ".tar.gz" ) ) ||
00093                      archivePath.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
00094                     uri.setScheme( "tar" );
00095                 }
00096                 else if ( archivePath.endsWith( QLatin1String( ".zip" ) ) ) {
00097                     uri.setScheme( "zip" );
00098                 }
00099             }
00100         }
00101 
00102         // fallback for all
00103         if ( uri.scheme().isEmpty() ) {
00104             uri.setScheme( "file" );
00105         }
00106 
00107         return uri;
00108     }
00109 
00110     QUrl createGraphUri() {
00111         return QUrl( "urn:nepomuk:local:" + QUuid::createUuid().toString().remove( QRegExp( "[\\{\\}]" ) ) );
00112     }
00113 
00114     class FileMetaData
00115     {
00116     public:
00117         // caching URIs for little speed improvement
00118         QUrl fileUri;
00119         QUrl context;
00120         std::string content;
00121     };
00122 
00123     class RegisteredFieldData
00124     {
00125     public:
00126         RegisteredFieldData( const QUrl& prop, QVariant::Type t )
00127             : property( prop ),
00128               dataType( t ),
00129               isRdfType( prop == Vocabulary::RDF::type() ) {
00130         }
00131 
00132         QUrl property;
00133         QVariant::Type dataType;
00134         bool isRdfType;
00135     };
00136 }
00137 
00138 
00139 class Strigi::Soprano::IndexWriter::Private
00140 {
00141 public:
00142     Private()
00143         : indexTransactionID( 0 ) {
00144         literalTypes[FieldRegister::stringType] = QVariant::String;
00145         literalTypes[FieldRegister::floatType] = QVariant::Double;
00146         literalTypes[FieldRegister::integerType] = QVariant::Int;
00147         literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
00148         literalTypes[FieldRegister::datetimeType] = QVariant::DateTime; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
00149     }
00150 
00151     QVariant::Type literalType( const Strigi::FieldProperties& strigiType ) {
00152         // it looks as if the typeUri can contain arbitrary values, URIs or stuff like "string"
00153         QHash<std::string, QVariant::Type>::const_iterator it = literalTypes.constFind( strigiType.typeUri() );
00154         if ( it == literalTypes.constEnd() ) {
00155             return LiteralValue::typeFromDataTypeUri( QUrl::fromEncoded( strigiType.typeUri().c_str() ) );
00156         }
00157         else {
00158             return *it;
00159         }
00160     }
00161 
00162     LiteralValue createLiteralValue( QVariant::Type type,
00163                                      const unsigned char* data,
00164                                      uint32_t size ) {
00165         QString value = QString::fromUtf8( ( const char* )data, size );
00166         if ( type == QVariant::DateTime ) { // dataTime is stored as integer in strigi!
00167             return LiteralValue( QDateTime::fromTime_t( value.toUInt() ) );
00168         }
00169         else if ( type != QVariant::Invalid ) {
00170             return LiteralValue::fromString( value, type );
00171         }
00172         else {
00173             // we default to string
00174             return LiteralValue( value );
00175         }
00176     }
00177 
00178     ::Soprano::Model* repository;
00179     int indexTransactionID;
00180 
00181 private:
00182     QHash<std::string, QVariant::Type> literalTypes;
00183 };
00184 
00185 
00186 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model* model )
00187     : Strigi::IndexWriter()
00188 {
00189 //    qDebug() << "IndexWriter::IndexWriter in thread" << QThread::currentThread();
00190     d = new Private;
00191     d->repository = model;
00192     Util::storeStrigiMiniOntology( d->repository );
00193 //    qDebug() << "IndexWriter::IndexWriter done in thread" << QThread::currentThread();
00194 }
00195 
00196 
00197 Strigi::Soprano::IndexWriter::~IndexWriter()
00198 {
00199     delete d;
00200 }
00201 
00202 
00203 void Strigi::Soprano::IndexWriter::commit()
00204 {
00205 }
00206 
00207 
00208 // delete all indexed data for the files listed in entries
00209 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector<std::string>& entries )
00210 {
00211 //    qDebug() << "IndexWriter::deleteEntries in thread" << QThread::currentThread();
00212 
00213     QString systemLocationUri = Util::fieldUri( FieldRegister::pathFieldName ).toString();
00214     for ( unsigned int i = 0; i < entries.size(); ++i ) {
00215         QString path = QString::fromUtf8( entries[i].c_str() );
00216         QString query = QString( "select ?g ?mg where { "
00217                                  "{ { ?r <%1> \"%2\"^^<%3> . } UNION { ?r <%1> %6 . } } . "
00218                                  "?g <%4> ?r . "
00219                                  "OPTIONAL { ?mg <%5> ?g . } }" )
00220                         .arg( systemLocationUri )
00221                         .arg( path )
00222                         .arg( Vocabulary::XMLSchema::string().toString() )
00223                         .arg( Strigi::Ontology::indexGraphFor().toString() )
00224                         .arg( Vocabulary::NRL::coreGraphMetadataFor().toString() )
00225                         .arg( Node( QUrl::fromLocalFile( path ) ).toN3() );
00226 
00227         qDebug() << "deleteEntries query:" << query;
00228 
00229         QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QueryLanguageSparql );
00230         if ( result.next() ) {
00231             Node indexGraph = result.binding( "g" );
00232             Node metaDataGraph = result.binding( "mg" );
00233 
00234             result.close();
00235 
00236             // delete the indexed data
00237             d->repository->removeContext( indexGraph );
00238 
00239             // delete the metadata (backwards compatible)
00240             if ( metaDataGraph.isValid() )
00241                 d->repository->removeContext( metaDataGraph );
00242             else
00243                 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
00244         }
00245     }
00246 }
00247 
00248 
00249 void Strigi::Soprano::IndexWriter::deleteAllEntries()
00250 {
00251 //    qDebug() << "IndexWriter::deleteAllEntries in thread" << QThread::currentThread();
00252 
00253     // query all index graphs (FIXME: would a type derived from nrl:Graph be better than only the predicate?)
00254     QString query = QString( "select ?g where { ?g <%1> ?r . }" ).arg( Strigi::Ontology::indexGraphFor().toString() );
00255 
00256     qDebug() << "deleteAllEntries query:" << query;
00257 
00258     QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00259     QList<Node> allIndexGraphs = result.iterateBindings( "g" ).allNodes();
00260     for ( QList<Node>::const_iterator it = allIndexGraphs.constBegin(); it != allIndexGraphs.constEnd(); ++it ) {
00261         Node indexGraph = *it;
00262 
00263         qDebug() << "Found indexGraph to delete:" << indexGraph;
00264 
00265         // delete the indexed data
00266         d->repository->removeContext( indexGraph );
00267 
00268         // delete the metadata
00269         d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
00270     }
00271 }
00272 
00273 
00274 // called for each indexed file
00275 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult* idx )
00276 {
00277     if ( idx->depth() > 0 ) {
00278         return;
00279     }
00280 
00281 //    qDebug() << "IndexWriter::startAnalysis in thread" << QThread::currentThread();
00282     FileMetaData* data = new FileMetaData();
00283     data->fileUri = createResourceUri( idx );
00284 
00285     // let's check if we already have data on the file
00286     StatementIterator it = d->repository->listStatements( Node(),
00287                                                           Strigi::Ontology::indexGraphFor(),
00288                                                           data->fileUri );
00289     if ( it.next() ) {
00290         data->context = it.current().subject().uri();
00291     }
00292     else {
00293         data->context = createGraphUri();
00294     }
00295 
00296 //    qDebug() << "Starting analysis for" << data->fileUri << "in thread" << QThread::currentThread();
00297 
00298     idx->setWriterData( data );
00299 }
00300 
00301 
00302 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult* idx, const char* text, int32_t length )
00303 {
00304     if ( idx->depth() > 0 ) {
00305         return;
00306     }
00307 
00308     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00309     md->content.append( text, length );
00310 }
00311 
00312 
00313 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00314                                              const RegisteredField* field,
00315                                              const std::string& value )
00316 {
00317     if ( idx->depth() > 0 ) {
00318         return;
00319     }
00320 
00321 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00322     if ( value.length() > 0 ) {
00323         FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00324         RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00325 
00326         // Strigi uses rdf:type improperly since it stores the value as a string. We have to
00327         // make sure it is a resource.
00328         if ( rfd->isRdfType ) {
00329             d->repository->addStatement( md->fileUri,
00330                                          ::Soprano::Vocabulary::RDF::type(),
00331                                          QUrl::fromEncoded( value.c_str(), QUrl::StrictMode ),
00332                                          md->context );
00333         }
00334         else {
00335             // we bend the plain strigi properties into something nicer, also because we do not want paths to be indexed, way too many false positives
00336             // in standard desktop searches
00337             if ( field->key() == FieldRegister::pathFieldName ||
00338                  field->key() == FieldRegister::parentLocationFieldName ) {
00339                 d->repository->addStatement( md->fileUri,
00340                                              rfd->property,
00341                                              QUrl::fromLocalFile( QFile::decodeName( QByteArray::fromRawData( value.c_str(), value.length() ) ) ),
00342                                              md->context );
00343             }
00344             else {
00345                 d->repository->addStatement( Statement( md->fileUri,
00346                                                         rfd->property,
00347                                                         d->createLiteralValue( rfd->dataType, ( unsigned char* )value.c_str(), value.length() ),
00348                                                         md->context) );
00349             }
00350         }
00351         if ( d->repository->lastError() )
00352             qDebug() << "Failed to add value" << value.c_str();
00353     }
00354 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00355 }
00356 
00357 
00358 // the main addValue method
00359 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00360                                              const RegisteredField* field,
00361                                              const unsigned char* data,
00362                                              uint32_t size )
00363 {
00364     addValue( idx, field, std::string( ( const char* )data, size ) );
00365 }
00366 
00367 
00368 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult*, const RegisteredField*,
00369                                              const std::string&, const std::string& )
00370 {
00371     // we do not support map types
00372 }
00373 
00374 
00375 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00376                                              const RegisteredField* field,
00377                                              uint32_t value )
00378 {
00379     if ( idx->depth() > 0 ) {
00380         return;
00381     }
00382 
00383 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00384     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00385     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00386 
00387     LiteralValue val( value );
00388     if ( field->type() == FieldRegister::datetimeType ) {
00389         val = QDateTime::fromTime_t( value );
00390     }
00391 
00392     d->repository->addStatement( Statement( md->fileUri,
00393                                             rfd->property,
00394                                             val,
00395                                             md->context) );
00396 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00397 }
00398 
00399 
00400 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00401                                              const RegisteredField* field,
00402                                              int32_t value )
00403 {
00404     if ( idx->depth() > 0 ) {
00405         return;
00406     }
00407 
00408 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00409     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00410     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00411 
00412     d->repository->addStatement( Statement( md->fileUri,
00413                                             rfd->property,
00414                                             LiteralValue( value ),
00415                                             md->context) );
00416 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00417 }
00418 
00419 
00420 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00421                                              const RegisteredField* field,
00422                                              double value )
00423 {
00424     if ( idx->depth() > 0 ) {
00425         return;
00426     }
00427 
00428 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00429     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00430     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00431 
00432     d->repository->addStatement( Statement( md->fileUri,
00433                                             rfd->property,
00434                                             LiteralValue( value ),
00435                                             md->context) );
00436 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00437 }
00438 
00439 
00440 void Strigi::Soprano::IndexWriter::addTriplet( const std::string& subject,
00441                                                const std::string& predicate, const std::string& object )
00442 {
00443     // PROBLEM: which named graph (context) should we use here? Create a new one for each triple? Use one until the
00444     // next commit()?
00445 
00446     // FIXME: create an NRL metadata graph
00447     d->repository->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject.c_str() ) ) ),
00448                                             Node( QUrl( QString::fromUtf8( predicate.c_str() ) ) ),
00449                                             Node( QUrl( QString::fromUtf8( object.c_str() ) ) ),
00450                                             Node() ) );
00451 }
00452 
00453 
00454 // called after each indexed file
00455 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult* idx )
00456 {
00457     if ( idx->depth() > 0 ) {
00458         return;
00459     }
00460 
00461 //    qDebug() << "IndexWriter::finishAnalysis in thread" << QThread::currentThread();
00462     FileMetaData* md = static_cast<FileMetaData*>( idx->writerData() );
00463 
00464     if ( md->content.length() > 0 ) {
00465         d->repository->addStatement( Statement( md->fileUri,
00466                                                 Vocabulary::Xesam::asText(),
00467                                                 LiteralValue( QString::fromUtf8( md->content.c_str() ) ),
00468                                                 md->context ) );
00469         if ( d->repository->lastError() )
00470             qDebug() << "Failed to add" << md->fileUri << "as text" << QString::fromUtf8( md->content.c_str() );
00471     }
00472 
00473     // Strigi only indexes files and extractors mostly (if at all) store the xesam:DataObject type (i.e. the contents)
00474     // Thus, here we go the easy way and mark each indexed file as a xesam:File.
00475     if ( QFileInfo( QFile::decodeName( idx->path().c_str() ) ).isDir() )
00476         d->repository->addStatement( Statement( md->fileUri,
00477                                                 Vocabulary::RDF::type(),
00478                                                 Vocabulary::Xesam::Folder(),
00479                                                 md->context ) );
00480     else
00481         d->repository->addStatement( Statement( md->fileUri,
00482                                                 Vocabulary::RDF::type(),
00483                                                 Vocabulary::Xesam::File(),
00484                                                 md->context ) );
00485 
00486 
00487     // create the provedance data for the data graph
00488     // TODO: add more data at some point when it becomes of interest
00489     QUrl metaDataContext = md->context.toString() + "-metadata";
00490     d->repository->addStatement( Statement( md->context,
00491                                             Vocabulary::RDF::type(),
00492                                             Vocabulary::NRL::InstanceBase(),
00493                                             metaDataContext ) );
00494     d->repository->addStatement( Statement( md->context,
00495                                             Vocabulary::NAO::created(),
00496                                             LiteralValue( QDateTime::currentDateTime() ),
00497                                             metaDataContext ) );
00498     d->repository->addStatement( Statement( md->context,
00499                                             Strigi::Ontology::indexGraphFor(),
00500                                             md->fileUri,
00501                                             metaDataContext ) );
00502     d->repository->addStatement( Statement( metaDataContext,
00503                                             Vocabulary::RDF::type(),
00504                                             Vocabulary::NRL::GraphMetadata(),
00505                                             metaDataContext ) );
00506     d->repository->addStatement( metaDataContext,
00507                                  Vocabulary::NRL::coreGraphMetadataFor(),
00508                                  md->context,
00509                                  metaDataContext );
00510 
00511     // cleanup
00512     delete md;
00513     idx->setWriterData( 0 );
00514 
00515 //    qDebug() << "IndexWriter::finishAnalysis done in thread" << QThread::currentThread();
00516 }
00517 
00518 
00519 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister& f )
00520 {
00521     map<string, RegisteredField*>::const_iterator i;
00522     map<string, RegisteredField*>::const_iterator end = f.fields().end();
00523     for (i = f.fields().begin(); i != end; ++i) {
00524         QUrl prop = Util::fieldUri( i->second->key() );
00525         i->second->setWriterData( new RegisteredFieldData( prop,
00526                                                            prop == Vocabulary::RDF::type()
00527                                                            ? QVariant::Invalid
00528                                                            : d->literalType( i->second->properties() ) ) );
00529     }
00530 }
00531 
00532 
00533 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister& f )
00534 {
00535     map<string, RegisteredField*>::const_iterator i;
00536     map<string, RegisteredField*>::const_iterator end = f.fields().end();
00537     for (i = f.fields().begin(); i != end; ++i) {
00538         delete static_cast<RegisteredFieldData*>( i->second->writerData() );
00539         i->second->setWriterData( 0 );
00540     }
00541 }

NepomukDaemons

Skip menu "NepomukDaemons"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

API Reference

Skip menu "API Reference"
  • KCMShell
  • KNotify
  • KStyles
  • Nepomuk Daemons
Generated for API Reference by doxygen 1.5.7
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal