upgrade
This commit is contained in:
333
main/inc/lib/search/xapian/XapianIndexer.class.php
Normal file
333
main/inc/lib/search/xapian/XapianIndexer.class.php
Normal file
@@ -0,0 +1,333 @@
|
||||
<?php
|
||||
/* For licensing terms, see /license.txt */
|
||||
|
||||
/**
|
||||
* @package chamilo.include.search
|
||||
*/
|
||||
require_once 'xapian.php';
|
||||
|
||||
/**
 * Abstract helper class around a writable Xapian database.
 *
 * It keeps the database handle, the term generator and the stemmer, queues
 * chunks to index, and offers helpers to read, update, replace and delete
 * documents.
 *
 * @package chamilo.include.search
 */
abstract class XapianIndexer
{
    /* XapianTermGenerator */
    public $indexer;
    /* XapianStem */
    public $stemmer;
    /* XapianWritableDatabase */
    protected $db;
    /* IndexableChunk[] */
    protected $chunks = [];

    /**
     * Class constructor.
     *
     * No connection is opened here; see connectDb().
     */
    public function __construct()
    {
        $this->db = null;
        $this->stemmer = null;
    }

    /**
     * Class destructor.
     *
     * Drops the references to the database and the stemmer.
     */
    public function __destruct()
    {
        unset($this->db);
        unset($this->stemmer);
    }

    /**
     * Generates a list of languages Xapian manages.
     *
     * This method enables the definition of more matches between
     * Chamilo languages and Xapian languages (through hardcoding)
     *
     * @return array Array of languages codes -> Xapian languages
     */
    final public function xapian_languages()
    {
        /* http://xapian.org/docs/apidoc/html/classXapian_1_1Stem.html */
        return [
            'none' => 'none', //don't stem terms
            'da' => 'danish',
            'nl' => 'dutch',
            /* Martin Porter's 2002 revision of his stemmer */
            'en' => 'english',
            /* Lovin's stemmer */
            'lovins' => 'english_lovins',
            /* Porter's stemmer as described in his 1980 paper */
            'porter' => 'english_porter',
            'fi' => 'finnish',
            'fr' => 'french',
            'de' => 'german',
            'it' => 'italian',
            'no' => 'norwegian',
            'pt' => 'portuguese',
            'ru' => 'russian',
            'es' => 'spanish',
            'sv' => 'swedish',
        ];
    }

    /**
     * Connect to the database, and create it if it doesn't exist.
     *
     * An already opened connection is reused and returned as is.
     *
     * @param string|null $path   Directory of the database; when empty, the
     *                            upload path followed by 'plugins/xapian/searchdb/'
     * @param int|null    $dbMode Xapian opening mode; when empty,
     *                            Xapian::DB_CREATE_OR_OPEN
     * @param string      $lang   Xapian language name (one of the values of
     *                            xapian_languages()); anything else falls
     *                            back to 'english'
     *
     * @return mixed The database object, or 1 when Xapian raised an exception
     *               (the error message is printed in that case)
     */
    public function connectDb($path = null, $dbMode = null, $lang = 'english')
    {
        if ($this->db != null) {
            return $this->db;
        }
        if ($dbMode == null) {
            $dbMode = Xapian::DB_CREATE_OR_OPEN;
        }

        if ($path == null) {
            $path = api_get_path(SYS_UPLOAD_PATH).'plugins/xapian/searchdb/';
        }

        try {
            $this->db = new XapianWritableDatabase($path, $dbMode);
            $this->indexer = new XapianTermGenerator();

            // Strict comparison: only an exact language name may reach XapianStem
            if (!in_array($lang, $this->xapian_languages(), true)) {
                $lang = 'english';
            }
            $this->stemmer = new XapianStem($lang);
            $this->indexer->set_stemmer($this->stemmer);

            return $this->db;
        } catch (Exception $e) {
            echo Display::return_message($e->getMessage(), 'error');

            return 1;
        }
    }

    /**
     * Simple getter for the db attribute.
     *
     * @return object The db attribute (null while not connected)
     */
    public function getDb()
    {
        return $this->db;
    }

    /**
     * Add this chunk to the chunk array attribute.
     *
     * @param mixed $chunk Chunk to queue; index() reads its terms, data and
     *                     xapian_data properties
     */
    public function addChunk($chunk)
    {
        $this->chunks[] = $chunk;
    }

    /**
     * Actually index the current data.
     *
     * Only the first queued chunk is indexed: the method returns as soon as
     * that chunk has been written, and the queue is not emptied afterwards.
     * NOTE(review): confirm whether callers ever queue more than one chunk.
     *
     * When Xapian raises an exception the message is printed and the script
     * is terminated with exit(1).
     *
     * @return int|null New Xapian document ID, or null when there is nothing
     *                  to index or no database connection could be opened
     */
    public function index()
    {
        try {
            if (empty($this->chunks)) {
                return null;
            }
            // Same lazy connection as the other methods of this class
            if ($this->db == null) {
                $this->connectDb();
            }
            if ($this->db == null || $this->indexer == null) {
                // connectDb() has already printed the reason
                return null;
            }

            foreach ($this->chunks as $chunk) {
                $doc = new XapianDocument();
                $this->indexer->set_document($doc);
                if (!empty($chunk->terms)) {
                    foreach ($chunk->terms as $term) {
                        /* FIXME: think of getting weight */
                        $doc->add_term($term['flag'].$term['name'], 1);
                    }
                }

                // free-form index all data array (title, content, etc)
                if (!empty($chunk->data)) {
                    foreach ($chunk->data as $value) {
                        $this->indexer->index_text($value, 1);
                    }
                }
                $doc->set_data($chunk->xapian_data, 1);
                $did = $this->db->add_document($doc);

                //write to disk
                $this->db->flush();

                return $did;
            }
        } catch (Exception $e) {
            echo Display::return_message($e->getMessage(), 'error');
            exit(1);
        }
    }

    /**
     * Get a specific document from xapian db.
     *
     * @param int $did Xapian::docid
     *
     * @return mixed XapianDocument, or false on error (including a failed
     *               database connection)
     */
    public function get_document($did)
    {
        if ($this->db == null) {
            $this->connectDb();
        }
        if ($this->db == null) {
            // The connection failed; connectDb() has already printed the reason
            return false;
        }
        try {
            $docid = $this->db->get_document($did);
        } catch (Exception $e) {
            //echo Display::return_message($e->getMessage(), 'error');
            return false;
        }

        return $docid;
    }

    /**
     * Get document data on a xapian document.
     *
     * @param XapianDocument $doc xapian document to read the data from
     *
     * @return mixed xapian document data or FALSE if error
     */
    public function get_document_data($doc)
    {
        // Reject invalid input before touching the database
        if (!is_a($doc, 'XapianDocument')) {
            return false;
        }
        // The connection is still opened here, as it always was
        if ($this->db == null) {
            $this->connectDb();
        }
        try {
            return $doc->get_data();
        } catch (Exception $e) {
            //echo Display::return_message($e->getMessage(), 'error');
            return false;
        }
    }

    /**
     * Replace all terms of a document in xapian db.
     *
     * @param int    $did    Xapian::docid
     * @param array  $terms  New terms of the document
     * @param string $prefix Prefix used to categorize the doc
     *                       (usually 'T' for title, 'A' for author)
     *
     * @return bool false on error (the Xapian message is printed), true otherwise
     */
    public function update_terms($did, $terms, $prefix)
    {
        $doc = $this->get_document($did);
        if ($doc === false) {
            return false;
        }
        try {
            $doc->clear_terms();
            foreach ($terms as $term) {
                //add directly
                $doc->add_term($prefix.$term, 1);
            }
            $this->db->replace_document($did, $doc);
            $this->db->flush();
        } catch (Exception $e) {
            echo Display::return_message($e->getMessage(), 'error');

            return false;
        }

        return true;
    }

    /**
     * Remove a document from xapian db.
     *
     * Nothing happens when the id is not a positive integer or when the
     * document cannot be fetched.
     *
     * @param int $did Xapian::docid
     */
    public function remove_document($did)
    {
        if ($this->db == null) {
            $this->connectDb();
        }
        $did = (int) $did;
        if ($did > 0) {
            $doc = $this->get_document($did);
            if ($doc !== false) {
                $this->db->delete_document($did);
                $this->db->flush();
            }
        }
    }

    /**
     * Adds a term to the document specified.
     *
     * The document is only changed in memory; nothing is written to the db.
     *
     * @param string         $term The term to add
     * @param XapianDocument $doc  The xapian document where to add the term
     *
     * @return mixed false when $doc is not a XapianDocument, 1 when Xapian
     *               raised an exception (the message is printed), null on success
     */
    public function add_term_to_doc($term, $doc)
    {
        if (!is_a($doc, 'XapianDocument')) {
            return false;
        }
        try {
            $doc->add_term($term);
        } catch (Exception $e) {
            echo Display::return_message($e->getMessage(), 'error');

            return 1;
        }
    }

    /**
     * Remove a term from the document specified.
     *
     * The document is only changed in memory; nothing is written to the db.
     *
     * @param string         $term The term to remove
     * @param XapianDocument $doc  The xapian document to remove the term from
     *
     * @return mixed false when $doc is not a XapianDocument, 1 when Xapian
     *               raised an exception (the message is printed), null on success
     */
    public function remove_term_from_doc($term, $doc)
    {
        if (!is_a($doc, 'XapianDocument')) {
            return false;
        }
        try {
            $doc->remove_term($term);
        } catch (Exception $e) {
            echo Display::return_message($e->getMessage(), 'error');

            return 1;
        }
    }

    /**
     * Replace a document in the actual db.
     *
     * @param XapianDocument $doc xapian document to push into the db
     * @param int            $did xapian document id of the document to replace
     *
     * @return mixed false when $doc is not a XapianDocument, 1 when the
     *               database is unavailable or Xapian raised an exception,
     *               null on success
     */
    public function replace_document($doc, $did)
    {
        if (!is_a($doc, 'XapianDocument')) {
            return false;
        }
        if ($this->db == null) {
            $this->connectDb();
        }
        if ($this->db == null) {
            // The connection failed; connectDb() has already printed the reason
            return 1;
        }
        try {
            $this->getDb()->replace_document((int) $did, $doc);
            $this->getDb()->flush();
        } catch (Exception $e) {
            echo Display::return_message($e->getMessage(), 'error');

            return 1;
        }
    }
}
|
||||
274
main/inc/lib/search/xapian/XapianQuery.php
Normal file
274
main/inc/lib/search/xapian/XapianQuery.php
Normal file
@@ -0,0 +1,274 @@
|
||||
<?php
|
||||
|
||||
/* For licensing terms, see /license.txt */
|
||||
/**
|
||||
* @package chamilo.include.search
|
||||
*/
|
||||
require_once 'xapian.php';
|
||||
//TODO: think another way without including specific fields here
|
||||
require_once api_get_path(LIBRARY_PATH).'specific_fields_manager.lib.php';
|
||||
// Path of the Xapian search database, built from the upload path.
// Used in this file whenever a function is called without a database object.
define('XAPIAN_DB', api_get_path(SYS_UPLOAD_PATH).'plugins/xapian/searchdb/');
|
||||
|
||||
/**
 * Queries the database.
 * The xapian_query function queries the database using both a query string
 * and application-defined terms. Based on drupal-xapian.
 *
 * When a query string is given, it is parsed (English stemmer) and combined
 * with the extra terms using AND; otherwise the extra terms alone are
 * combined using OR.
 *
 * @param string         $query_string The search string. This string will
 *                                     be parsed and stemmed automatically.
 * @param XapianDatabase $db           Xapian database to connect; when it is
 *                                     not an object, the database at XAPIAN_DB
 *                                     is opened
 * @param int            $start        An integer defining the first
 *                                     document to return
 * @param int            $length       the number of results to return
 * @param array          $extra        an array of extra terms to search for;
 *                                     empty entries are skipped
 * @param int            $count_type   Which match count to return: 1 for the
 *                                     lower bound, 2 for the upper bound,
 *                                     anything else for the best estimate
 *
 * @return array|null [count, results], where results is keyed by the 1-based
 *                    position of each match and holds 'courseid', 'toolid',
 *                    one 'sf-<code>' entry per specific field, 'xapian_data'
 *                    and 'score'; null when Xapian raised an exception
 */
function xapian_query($query_string, $db = null, $start = 0, $length = 10, $extra = [], $count_type = 0)
{
    try {
        if (!is_object($db)) {
            $db = new XapianDatabase(XAPIAN_DB);
        }

        // Build subqueries from $extra array. Now only used by tags search filter on search widget
        $subqueries = [];
        foreach ($extra as $subquery) {
            if (!empty($subquery)) {
                $subqueries[] = new XapianQuery($subquery);
            }
        }

        $enquire = new XapianEnquire($db);
        if (!empty($query_string)) {
            $query_parser = new XapianQueryParser();
            //TODO: choose stemmer
            $stemmer = new XapianStem("english");
            $query_parser->set_stemmer($stemmer);
            $query_parser->set_database($db);
            $query_parser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
            $query_parser->add_boolean_prefix('courseid', XAPIAN_PREFIX_COURSEID);
            $query_parser->add_boolean_prefix('toolid', XAPIAN_PREFIX_TOOLID);
            $parsed_query = $query_parser->parse_query($query_string);
            $final_array = array_merge($subqueries, [$parsed_query]);
            $query = new XapianQuery(XapianQuery::OP_AND, $final_array);
        } else {
            $query = new XapianQuery(XapianQuery::OP_OR, $subqueries);
        }

        $enquire->set_query($query);

        $matches = $enquire->get_mset((int) $start, (int) $length);

        $specific_fields = get_specific_field_list();

        // Value of the first term of a list without its one-character prefix;
        // an empty string when the document carries no such term.
        $first_term_value = static function ($terms) {
            if (!isset($terms[0]['name'])) {
                return '';
            }

            return substr($terms[0]['name'], 1);
        };

        $results = [];
        $i = $matches->begin();

        // Display the results.
        //echo $matches->get_matches_estimated().'results found';

        $count = 0;

        while (!$i->equals($matches->end())) {
            $count++;
            $document = $i->get_document();

            if (is_object($document)) {
                // process one item terms
                $courseid_terms = xapian_get_doc_terms($document, XAPIAN_PREFIX_COURSEID);
                $results[$count]['courseid'] = $first_term_value($courseid_terms);
                $toolid_terms = xapian_get_doc_terms($document, XAPIAN_PREFIX_TOOLID);
                $results[$count]['toolid'] = $first_term_value($toolid_terms);

                // process each specific field prefix
                foreach ($specific_fields as $specific_field) {
                    $results[$count]['sf-'.$specific_field['code']] = xapian_get_doc_terms($document, $specific_field['code']);
                }

                // rest of data
                // NOTE(review): unserialize() is only safe while the index content
                // is written by trusted code; confirm nothing else can write to it.
                $results[$count]['xapian_data'] = unserialize($document->get_data());
                $results[$count]['score'] = ($i->get_percent());
            }
            $i->next();
        }

        switch ($count_type) {
            case 1: // Lower bound
                $count = $matches->get_matches_lower_bound();
                break;

            case 2: // Upper bound
                $count = $matches->get_matches_upper_bound();
                break;

            case 0: // Best estimate
            default:
                $count = $matches->get_matches_estimated();
                break;
        }

        return [$count, $results];
    } catch (Exception $e) {
        display_xapian_error($e->getMessage());

        return null;
    }
}
|
||||
|
||||
/**
 * Wraps a single term into a Xapian query object.
 *
 * @param string $term Term handed over to the XapianQuery constructor
 *
 * @return XapianQuery
 */
function xapian_get_boolean_query($term)
{
    $boolean_query = new XapianQuery($term);

    return $boolean_query;
}
|
||||
|
||||
/**
 * Retrieve a list db terms.
 *
 * @param int            $count  Number of terms to retrieve; 0 means no limit
 * @param string         $prefix The prefix of the terms to retrieve; when
 *                               empty, terms are listed without prefix filter
 * @param XapianDatabase $db     Xapian database to connect; when it is not an
 *                               object, the database at XAPIAN_DB is opened
 *
 * @return array|null List of ['frequency' => ..., 'name' => ...] entries, or
 *                    null when Xapian raised an exception
 */
function xapian_get_all_terms($count = 0, $prefix = '', $db = null)
{
    // $prefix has a default so that the optional $count is no longer declared
    // before a required parameter (deprecated since PHP 8.0).
    try {
        if (!is_object($db)) {
            $db = new XapianDatabase(XAPIAN_DB);
        }

        if (!empty($prefix)) {
            $termi = $db->allterms_begin($prefix);
        } else {
            $termi = $db->allterms_begin();
        }

        $terms = [];
        $i = 0;
        // Stop at the end of the list, or once $count terms were collected
        for (; !$termi->equals($db->allterms_end()) && (++$i <= $count || $count == 0); $termi->next()) {
            $terms[] = [
                'frequency' => $termi->get_termfreq(),
                'name' => $termi->get_term(),
            ];
        }

        return $terms;
    } catch (Exception $e) {
        display_xapian_error($e->getMessage());

        return null;
    }
}
|
||||
|
||||
/**
 * Retrieve the terms of a document that start with a given character.
 *
 * @param XapianDocument $doc    document searched
 * @param string         $prefix Single character compared with the first
 *                               character of each term name
 *
 * @return array|null List of ['frequency' => ..., 'name' => ...] entries;
 *                    null when $doc is not a XapianDocument or when Xapian
 *                    raised an exception
 */
function xapian_get_doc_terms($doc, $prefix)
{
    // $doc used to carry a "= null" default that could never be used, since
    // the required $prefix follows it (deprecated since PHP 8.0).
    try {
        if (!is_a($doc, 'XapianDocument')) {
            return;
        }

        //TODO: make the filter by prefix on xapian if possible
        //ojwb marvil07: use Document::termlist_begin() and then skip_to(prefix) on the TermIterator
        //ojwb you'll need to check the end condition by hand though
        $terms = [];
        for ($termi = $doc->termlist_begin(); !$termi->equals($doc->termlist_end()); $termi->next()) {
            $term = [
                'frequency' => $termi->get_termfreq(),
                'name' => $termi->get_term(),
            ];
            // An empty name has no first character to compare
            if ($term['name'] !== '' && $term['name'][0] === $prefix) {
                $terms[] = $term;
            }
        }

        return $terms;
    } catch (Exception $e) {
        display_xapian_error($e->getMessage());

        return null;
    }
}
|
||||
|
||||
/**
 * Combine Xapian queries into a single query.
 *
 * Each argument may be one query or an array of queries; everything is put
 * into one list which is joined with the chosen operator.
 *
 * @param XapianQuery|array $query1 First query, or list of queries
 * @param XapianQuery|array $query2 Optional second query, or list of queries
 * @param string            $op     'and' joins with AND; 'or' and any other
 *                                  value join with OR
 *
 * @return XapianQuery query joined
 */
function xapian_join_queries($query1, $query2 = null, $op = 'or')
{
    // The operator is picked here so callers need not include xapian.php.
    // Loose comparisons, 'or' tested first: the same matching order as a switch.
    $operator = XapianQuery::OP_OR;
    if ($op != 'or' && $op == 'and') {
        $operator = XapianQuery::OP_AND;
    }

    $left = is_array($query1) ? $query1 : [$query1];
    if (is_null($query2)) {
        // Only one side given: join its queries with the operator
        return new XapianQuery($operator, $left);
    }
    $right = is_array($query2) ? $query2 : [$query2];

    return new XapianQuery($operator, array_merge($left, $right));
}
|
||||
|
||||
/**
 * Prints a Chamilo error message for a Xapian error.
 *
 * The text before the first ':' of the Xapian message is taken as the error
 * type and mapped to a language variable; unknown types use a generic one.
 *
 * @author Isaac flores paz <florespaz@bidsoftperu.com>
 *
 * @param string $xapian_error_message The xapian error message
 */
function display_xapian_error($xapian_error_message)
{
    // Xapian error type => language variable of the message to show
    $lang_variables = [
        'DatabaseOpeningError' => 'SearchDatabaseOpeningError',
        'DatabaseVersionError' => 'SearchDatabaseVersionError',
        'DatabaseModifiedError' => 'SearchDatabaseModifiedError',
        'DatabaseLockError' => 'SearchDatabaseLockError',
        'DatabaseCreateError' => 'SearchDatabaseCreateError',
        'DatabaseCorruptError' => 'SearchDatabaseCorruptError',
        'NetworkTimeoutError' => 'SearchNetworkTimeoutError',
    ];

    $parts = explode(':', $xapian_error_message);
    $error_type = $parts[0];
    $variable = isset($lang_variables[$error_type]) ? $lang_variables[$error_type] : 'SearchOtherXapianError';

    $translated = get_lang($variable);
    echo Display::return_message(get_lang('Error').' : '.$translated, 'error');
}
|
||||
6
main/inc/lib/search/xapian/index.html
Normal file
6
main/inc/lib/search/xapian/index.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user