diff options
Diffstat (limited to 'MLEB/Translate/ttmserver')
-rw-r--r-- | MLEB/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php | 110 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/DatabaseTTMServer.php | 18 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/ElasticSearchTTMServer.php | 258 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/FakeTTMServer.php | 5 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/FuzzyLikeThis.php | 1 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/Interfaces.php | 16 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/SolrTTMServer.php | 445 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/TTMServer.php | 34 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/TTMServerMessageUpdateJob.php | 6 | ||||
-rw-r--r-- | MLEB/Translate/ttmserver/schema.xml | 45 |
10 files changed, 268 insertions, 670 deletions
diff --git a/MLEB/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php b/MLEB/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php index ba620e40..6a6d0226 100644 --- a/MLEB/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php +++ b/MLEB/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php @@ -4,13 +4,13 @@ * @since 2015.08 */ class CrossLanguageTranslationSearchQuery { - /** @var TTMServer */ + /** @var SearchableTTMServer */ protected $server; /** @var array */ protected $params; - /** @var ResultSet */ + /** @var \Elastica\ResultSet */ protected $resultset; /** @var int */ @@ -25,56 +25,74 @@ class CrossLanguageTranslationSearchQuery { public function getDocuments() { $documents = []; - $total = $start = 0; - $queryString = $this->params['query']; $offset = $this->params['offset']; $limit = $this->params['limit']; - $size = 1000; $options = $this->params; - $options['limit'] = $size; $options['language'] = $this->params['sourcelanguage']; - do { - $options['offset'] = $start; - $this->resultset = $this->server->search( $queryString, $options, $this->hl ); - - list( $results, $offsets ) = $this->extractMessages( - $this->resultset, - $offset, - $limit - ); - $offset = $offsets['start'] + $offsets['left'] - $offsets['total']; - $limit = $limit - $offsets['left']; - $total = $total + $offsets['total']; + // Use a bigger limit that what was requested, since we are likely to throw away many + // results in the local filtering step at extractMessages + $options['limit'] = $limit * 10; + // TODO: the real offset should be communicated to the frontend. It currently assumes + // next offset is current offset + limit and previous one is current offset - limit. + // It might be difficult to fix scrolling results backwards. For now we handle offset + // locally. + $options['offset'] = 0; + + // @phan-suppress-next-line PhanUndeclaredMethod + $search = $this->server->createSearch( $this->params['query'], $options, $this->hl ); + $scroll = $search->scroll( '5s' ); + + // Used for aggregations. Only the first scroll response has them. + $this->resultset = null; + + foreach ( $scroll as $resultSet ) { + if ( !$this->resultset ) { + $this->resultset = $resultSet; + $this->total = $resultSet->getTotalHits(); + } + $results = $this->extractMessages( $resultSet->getDocuments() ); $documents = array_merge( $documents, $results ); - $start = $start + $size; - } while ( - $offsets['start'] + $offsets['left'] >= $offsets['total'] && - $this->resultset->getTotalHits() > $start - ); - $this->total = $total; + + $count = count( $documents ); + + if ( $count >= $offset + $limit ) { + break; + } + } + + if ( !$this->resultset ) { + // No hits for documents, just set the result set. + $this->resultset = $scroll->current(); + $this->total = $scroll->current()->getTotalHits(); + } + + // clear was introduced in Elastica 5.3.1, but Elastica extension uses 5.3.0 + if ( is_callable( [ $scroll, 'clear' ] ) ) { + $scroll->clear(); + } + $documents = array_slice( $documents, $offset, $limit ); return $documents; } /** - * Extract messages from the resultset and build message definitions. + * Extract messages from the documents and build message definitions. * Create a message collection from the definitions in the target language. * Filter the message collection to get filtered messages. * Slice messages according to limit and offset given. - * @param ResultSet $resultset - * @param int $offset - * @param int $limit - * @return array + * @param \Elastica\Document[] $documents + * @return array[] */ - protected function extractMessages( $resultset, $offset, $limit ) { - $messages = $documents = $ret = []; + protected function extractMessages( $documents ) { + $messages = $ret = []; $language = $this->params['language']; - foreach ( $resultset->getResults() as $document ) { + foreach ( $documents as $document ) { $data = $document->getData(); + // @phan-suppress-next-line PhanUndeclaredMethod if ( !$this->server->isLocalSuggestion( $data ) ) { continue; } @@ -103,32 +121,26 @@ class CrossLanguageTranslationSearchQuery { $collection->filter( $filter, false ); } - $total = count( $collection ); - $offset = $collection->slice( $offset, $limit ); - $left = count( $collection ); - - $offsets = [ - 'start' => $offset[2], - 'left' => $left, - 'total' => $total, - ]; - if ( $filter === 'translated' || $filter === 'fuzzy' ) { $collection->loadTranslations(); } - foreach ( $collection->keys() as $mkey => $title ) { - $documents[$mkey]['content'] = $messages[$mkey]; + foreach ( $collection->keys() as $mkey => $titleValue ) { + $title = Title::newFromLinkTarget( $titleValue ); + + $result = []; + $result['content'] = $messages[$mkey]; if ( $filter === 'translated' || $filter === 'fuzzy' ) { - $documents[$mkey]['content'] = $collection[$mkey]->translation(); + $result['content'] = $collection[$mkey]->translation(); } $handle = new MessageHandle( $title ); - $documents[$mkey]['localid'] = $handle->getTitleForBase()->getPrefixedText(); - $documents[$mkey]['language'] = $language; - $ret[] = $documents[$mkey]; + $result['localid'] = $handle->getTitleForBase()->getPrefixedText(); + $result['language'] = $language; + + $ret[] = $result; } - return [ $ret, $offsets ]; + return $ret; } /** diff --git a/MLEB/Translate/ttmserver/DatabaseTTMServer.php b/MLEB/Translate/ttmserver/DatabaseTTMServer.php index 74d2360d..c172c34a 100644 --- a/MLEB/Translate/ttmserver/DatabaseTTMServer.php +++ b/MLEB/Translate/ttmserver/DatabaseTTMServer.php @@ -9,6 +9,7 @@ * @ingroup TTMServer */ +use MediaWiki\MediaWikiServices; use Wikimedia\Rdbms\DBQueryError; /** @@ -147,9 +148,10 @@ class DatabaseTTMServer extends TTMServer implements WritableTTMServer, Readable $dbw->delete( 'translate_tms', '*', __METHOD__ ); $dbw->delete( 'translate_tmt', '*', __METHOD__ ); $dbw->delete( 'translate_tmf', '*', __METHOD__ ); + // @phan-suppress-next-line PhanUndeclaredMethod $table = $dbw->tableName( 'translate_tmf' ); try { - $dbw->query( "DROP INDEX tmf_text ON $table" ); + $dbw->query( "DROP INDEX tmf_text ON $table", __METHOD__ ); } catch ( DBQueryError $e ) { // Perhaps the script was aborted before it got // chance to add the index back. @@ -167,7 +169,8 @@ class DatabaseTTMServer extends TTMServer implements WritableTTMServer, Readable $context = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() ); $this->sids[$key] = $this->insertSource( $context, $language, $text ); } - wfWaitForSlaves( 10 ); + $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); + $lbFactory->waitForReplication( [ 'ifWritesSince' => 10 ] ); } public function batchInsertTranslations( array $batch ) { @@ -183,7 +186,8 @@ class DatabaseTTMServer extends TTMServer implements WritableTTMServer, Readable $dbw = $this->getDB( DB_MASTER ); $dbw->insert( 'translate_tmt', $rows, __METHOD__ ); - wfWaitForSlaves( 10 ); + $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); + $lbFactory->waitForReplication( [ 'ifWritesSince' => 10 ] ); } public function endBatch() { @@ -191,8 +195,9 @@ class DatabaseTTMServer extends TTMServer implements WritableTTMServer, Readable public function endBootstrap() { $dbw = $this->getDB( DB_MASTER ); + // @phan-suppress-next-line PhanUndeclaredMethod $table = $dbw->tableName( 'translate_tmf' ); - $dbw->query( "CREATE FULLTEXT INDEX tmf_text ON $table (tmf_text)" ); + $dbw->query( "CREATE FULLTEXT INDEX tmf_text ON $table (tmf_text)", __METHOD__ ); } /* Reading interface */ @@ -271,7 +276,7 @@ class DatabaseTTMServer extends TTMServer implements WritableTTMServer, Readable 'context' => $row->tms_context, 'location' => $row->tms_context . '/' . $targetLanguage, 'quality' => $quality, - 'wiki' => isset( $row->tms_wiki ) ? $row->tms_wiki : wfWikiID(), + 'wiki' => $row->tms_wiki ?? wfWikiID(), ]; } } @@ -279,4 +284,7 @@ class DatabaseTTMServer extends TTMServer implements WritableTTMServer, Readable return $results; } + + public function setDoReIndex() { + } } diff --git a/MLEB/Translate/ttmserver/ElasticSearchTTMServer.php b/MLEB/Translate/ttmserver/ElasticSearchTTMServer.php index a1a01c0e..1a98c8cc 100644 --- a/MLEB/Translate/ttmserver/ElasticSearchTTMServer.php +++ b/MLEB/Translate/ttmserver/ElasticSearchTTMServer.php @@ -17,39 +17,33 @@ use MediaWiki\Logger\LoggerFactory; */ class ElasticSearchTTMServer extends TTMServer - implements ReadableTTMServer, WritableTTMServer, SearchableTTMserver + implements ReadableTTMServer, WritableTTMServer, SearchableTTMServer { /** - * @const int number of documents that will be loaded and deleted in a - * single operation - */ - const BULK_DELETE_CHUNK_SIZE = 100; - - /** * @const int in case a write operation fails during a batch process * this constant controls the number of times we will retry the same * operation. */ - const BULK_INDEX_RETRY_ATTEMPTS = 5; + private const BULK_INDEX_RETRY_ATTEMPTS = 5; /** * @const int time (seconds) to wait for the index to ready before * starting to index. Since we wait for index status it can be relatively * long especially if some nodes are restarted. */ - const WAIT_UNTIL_READY_TIMEOUT = 3600; + private const WAIT_UNTIL_READY_TIMEOUT = 3600; /** * Flag in the frozen index that indicates that all indices * are frozen (useful only when this service shares the cluster with * CirrusSearch) */ - const ALL_INDEXES_FROZEN_NAME = 'freeze_everything'; + protected const ALL_INDEXES_FROZEN_NAME = 'freeze_everything'; /** * Type used in the frozen index */ - const FROZEN_TYPE = 'frozen'; + protected const FROZEN_TYPE = 'frozen'; /** * @var \Elastica\Client @@ -101,36 +95,13 @@ class ElasticSearchTTMServer $fuzzyQuery->addFields( [ 'content' ] ); $boostQuery = new \Elastica\Query\FunctionScore(); - if ( $this->useWikimediaExtraPlugin() ) { - $boostQuery->addFunction( - 'levenshtein_distance_score', - [ - 'text' => $text, - 'field' => 'content' - ] - ); - } else { - // TODO: should we remove this code block the extra - // plugin is now mandatory and we will never use the - // groovy script. - if ( $this->isElastica5() ) { - $scriptClass = \Elastica\Script\Script::class; - } else { - $scriptClass = \Elastica\Script::class; - } - - $groovyScript = -<<<GROOVY -import org.apache.lucene.search.spell.* -new LevensteinDistance().getDistance(srctxt, _source['content']) -GROOVY; - $script = new $scriptClass( - $groovyScript, - [ 'srctxt' => $text ], - $scriptClass::LANG_GROOVY - ); - $boostQuery->addScriptScoreFunction( $script ); - } + $boostQuery->addFunction( + 'levenshtein_distance_score', + [ + 'text' => $text, + 'field' => 'content' + ] + ); $boostQuery->setBoostMode( \Elastica\Query\FunctionScore::BOOST_MODE_REPLACE ); // Wrap the fuzzy query so it can be used as a filter. @@ -161,7 +132,7 @@ GROOVY; $query->setFrom( 0 ); $query->setSize( $sizeFirst ); $query->setParam( '_source', [ 'content' ] ); - $cutoff = isset( $this->config['cutoff'] ) ? $this->config['cutoff'] : 0.65; + $cutoff = $this->config['cutoff'] ?? 0.65; $query->setParam( 'min_score', $cutoff ); $query->setSort( [ '_score', '_uid' ] ); @@ -204,6 +175,7 @@ GROOVY; } // After the first query, the smallest score is the new threshold. + // @phan-suppress-next-line PhanPossiblyUndeclaredVariable $query->setParam( 'min_score', $score ); $query->setFrom( $query->getParam( 'size' ) + $query->getParam( 'from' ) ); $query->setSize( $sizeSecond ); @@ -258,6 +230,14 @@ GROOVY; /* Write functions */ + /** + * Add / update translations. + * + * @param MessageHandle $handle + * @param ?string $targetText + * @throws \RuntimeException + * @return bool + */ public function update( MessageHandle $handle, $targetText ) { if ( !$handle->isValid() || $handle->getCode() === '' ) { return false; @@ -278,14 +258,11 @@ GROOVY; // Do not delete definitions, because the translations are attached to that if ( $handle->getCode() !== $sourceLanguage ) { $localid = $handle->getTitleForBase()->getPrefixedText(); - - $boolQuery = new \Elastica\Query\BoolQuery(); - $boolQuery->addFilter( new Elastica\Query\Term( [ 'wiki' => wfWikiID() ] ) ); - $boolQuery->addFilter( new Elastica\Query\Term( [ 'language' => $handle->getCode() ] ) ); - $boolQuery->addFilter( new Elastica\Query\Term( [ 'localid' => $localid ] ) ); - - $query = new \Elastica\Query( $boolQuery ); - $this->deleteByQuery( $this->getType(), $query ); + $this->deleteByQuery( $this->getType(), Elastica\Query::create( + ( new \Elastica\Query\BoolQuery() ) + ->addFilter( new Elastica\Query\Term( [ 'wiki' => wfWikiID() ] ) ) + ->addFilter( new Elastica\Query\Term( [ 'language' => $handle->getCode() ] ) ) + ->addFilter( new Elastica\Query\Term( [ 'localid' => $localid ] ) ) ) ); } // If translation was made fuzzy, we do not need to add anything @@ -376,6 +353,11 @@ GROOVY; $type->getIndex()->create( $indexSettings, $rebuild ); } + /** + * Begin the bootstrap process. + * + * @throws \RuntimeException + */ public function beginBootstrap() { $type = $this->getType(); if ( $this->updateMapping ) { @@ -388,43 +370,32 @@ GROOVY; $settings = $type->getIndex()->getSettings(); $settings->setRefreshInterval( '-1' ); - $term = new Elastica\Query\Term(); - $term->setTerm( 'wiki', wfWikiID() ); - $query = new \Elastica\Query( $term ); - $this->deleteByQuery( $type, $query ); + $this->deleteByQuery( $this->getType(), \Elastica\Query::create( + ( new Elastica\Query\Term() )->setTerm( 'wiki', wfWikiID() ) ) ); $mapping = new \Elastica\Type\Mapping(); $mapping->setType( $type ); - - $keywordType = [ 'type' => 'string', 'index' => 'not_analyzed' ]; - $textType = 'string'; - if ( $this->isElastica5() ) { - $keywordType = [ 'type' => 'keyword' ]; - $textType = 'text'; - } $mapping->setProperties( [ - 'wiki' => $keywordType, - 'localid' => $keywordType, - 'uri' => $keywordType, - 'language' => $keywordType, - 'group' => $keywordType, + 'wiki' => [ 'type' => 'keyword' ], + 'localid' => [ 'type' => 'keyword' ], + 'uri' => [ 'type' => 'keyword' ], + 'language' => [ 'type' => 'keyword' ], + 'group' => [ 'type' => 'keyword' ], 'content' => [ - 'type' => $textType, + 'type' => 'text', 'fields' => [ 'content' => [ - 'type' => $textType, - 'index' => 'analyzed', + 'type' => 'text', 'term_vector' => 'yes' ], 'prefix_complete' => [ - 'type' => $textType, + 'type' => 'text', 'analyzer' => 'prefix', 'search_analyzer' => 'standard', 'term_vector' => 'yes' ], 'case_sensitive' => [ - 'type' => $textType, - 'index' => 'analyzed', + 'type' => 'text', 'analyzer' => 'casesensitive', 'term_vector' => 'yes' ] @@ -440,6 +411,10 @@ GROOVY; // I hate the rule that forbids {} } + /** + * @param array[] $batch + * @phan-param array<int,array{0:MessageHandle,1:string,2:string}> $batch + */ public function batchInsertDefinitions( array $batch ) { $lb = new LinkBatch(); foreach ( $batch as $data ) { @@ -478,11 +453,7 @@ GROOVY; public function endBootstrap() { $index = $this->getType()->getIndex(); $index->refresh(); - if ( $this->isElastica5() ) { - $index->forcemerge(); - } else { - $index->optimize(); - } + $index->forcemerge(); $index->getSettings()->setRefreshInterval( '5s' ); } @@ -508,11 +479,7 @@ GROOVY; * @return string */ private function getIndexName() { - if ( isset( $this->config['index'] ) ) { - return $this->config['index']; - } else { - return 'ttmserver'; - } + return $this->config['index'] ?? 'ttmserver'; } public function getType() { @@ -522,15 +489,17 @@ GROOVY; } protected function getShardCount() { - return isset( $this->config['shards'] ) ? $this->config['shards'] : 1; + return $this->config['shards'] ?? 1; } protected function getReplicaCount() { - return isset( $this->config['replicas'] ) ? $this->config['replicas'] : '0-2'; + return $this->config['replicas'] ?? '0-2'; } /** * Get index health + * TODO: Remove this code in the future as we drop support for + * older versions of the Elastica extension. * * @param string $indexName * @return array the index health status @@ -539,7 +508,7 @@ GROOVY; $path = "_cluster/health/$indexName"; $response = $this->getClient()->request( $path ); if ( $response->hasError() ) { - throw new \Exception( "Error while fetching index health status: ". $response->getError() ); + throw new \Exception( "Error while fetching index health status: " . $response->getError() ); } return $response->getData(); } @@ -551,6 +520,8 @@ GROOVY; * CirrusSearch/includes/Maintenance/ConfigUtils.php. Ideally we'd * like to make these utility methods available in the Elastica * extension, but this one requires some refactoring in cirrus first. + * TODO: Remove this code in the future as we drop support for + * older versions of the Elastica extension. * * @param string $indexName * @param int $timeout @@ -561,7 +532,7 @@ GROOVY; while ( ( $startTime + $timeout ) > time() ) { try { $response = $this->getIndexHealth( $indexName ); - $status = isset( $response['status'] ) ? $response['status'] : 'unknown'; + $status = $response['status'] ?? 'unknown'; if ( $status === 'green' ) { $this->logOutput( "\tGreen!" ); return true; @@ -576,10 +547,29 @@ GROOVY; } protected function waitUntilReady() { + if ( method_exists( MWElasticUtils::class, 'waitForGreen' ) ) { + $statuses = MWElasticUtils::waitForGreen( + $this->getClient(), + $this->getIndexName(), + self::WAIT_UNTIL_READY_TIMEOUT ); + $this->logOutput( "Waiting for the index to go green..." ); + foreach ( $statuses as $message ) { + $this->logOutput( $message ); + } + + if ( !$statuses->getReturn() ) { + die( "Timeout! Please check server logs for {$this->getIndexName()}." ); + } + + return; + } + + // TODO: This code can be removed in the future as we drop support for + // older versions of the Elastica extension. $indexName = $this->getType()->getIndex()->getName(); $this->logOutput( "Waiting for the index to go green..." ); if ( !$this->waitForGreen( $indexName, self::WAIT_UNTIL_READY_TIMEOUT ) ) { - die( "Timeout! Please check server logs for {$this->getIndex()->getName()}." ); + die( "Timeout! Please check server logs for {$this->getIndexName()}." ); } } @@ -596,9 +586,9 @@ GROOVY; /** * Force the update of index mappings - * @since 2015.03 + * @inheritDoc */ - public function doMappingUpdate() { + public function setDoReIndex() { $this->updateMapping = true; } @@ -678,9 +668,9 @@ GROOVY; * @param string $queryString * @param array $opts * @param array $highlight - * @return array + * @return \Elastica\Search */ - public function search( $queryString, $opts, $highlight ) { + public function createSearch( $queryString, $opts, $highlight ) { $query = new \Elastica\Query(); list( $searchQuery, $highlights ) = $this->parseQueryString( $queryString, $opts ); @@ -723,6 +713,9 @@ GROOVY; // Check that we have at least one filter to avoid invalid query errors. if ( $language !== '' || $group !== '' ) { + // TODO: This seems wrong, but perhaps for aggregation purposes? + // should make $search a must clause and use the bool query + // as main. $query->setPostFilter( $filters ); } @@ -734,15 +727,34 @@ GROOVY; 'fields' => $highlights, ] ); + return $this->getType()->getIndex()->createSearch( $query ); + } + + /** + * Search interface + * @param string $queryString + * @param array $opts + * @param array $highlight + * @throws TTMServerException + * @return \Elastica\ResultSet + */ + public function search( $queryString, $opts, $highlight ) { + $search = $this->createSearch( $queryString, $opts, $highlight ); + try { - return $this->getType()->getIndex()->search( $query ); + return $search->search(); } catch ( \Elastica\Exception\ExceptionInterface $e ) { throw new TTMServerException( $e->getMessage() ); } } + /** + * @param \Elastica\ResultSet $resultset + * @return array + */ public function getFacets( $resultset ) { $aggs = $resultset->getAggregations(); + '@phan-var array[][][] $aggs'; $ret = [ 'language' => [], @@ -758,10 +770,18 @@ GROOVY; return $ret; } + /** + * @param \Elastica\ResultSet $resultset + * @return int + */ public function getTotalHits( $resultset ) { return $resultset->getTotalHits(); } + /** + * @param \Elastica\ResultSet $resultset + * @return array + */ public function getDocuments( $resultset ) { $ret = []; foreach ( $resultset->getResults() as $document ) { @@ -782,13 +802,30 @@ GROOVY; /** * Delete docs by query by using the scroll API. + * TODO: Elastica\Index::deleteByQuery() ? was removed + * in 2.x and returned in 5.x. * * @param \Elastica\Type $type the source index * @param \Elastica\Query $query the query + * @throws \RuntimeException */ private function deleteByQuery( \Elastica\Type $type, \Elastica\Query $query ) { - $retryAttempts = self::BULK_INDEX_RETRY_ATTEMPTS; + if ( method_exists( MWElasticUtils::class, 'deleteByQuery' ) ) { + try { + MWElasticUtils::deleteByQuery( $type->getIndex(), $query, /* $allowConflicts = */ true ); + } catch ( \Exception $e ) { + LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->error( + 'Problem encountered during deletion.', + [ 'exception' => $e ] + ); + throw new \RuntimeException( "Problem encountered during deletion.\n" . $e ); + } + return; + } + // TODO: This code can be removed in the future as we drop support for + // older versions of the Elastica extension. + $retryAttempts = self::BULK_INDEX_RETRY_ATTEMPTS; $search = new \Elastica\Search( $this->getClient() ); $search->setQuery( $query ); $search->addType( $type ); @@ -817,12 +854,27 @@ GROOVY; * @return bool */ public function isFrozen() { + if ( method_exists( MWElasticUtils::class, 'isFrozen' ) ) { + try { + return MWElasticUtils::isFrozen( $this->getClient() ); + } catch ( \Exception $e ) { + LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->warning( + 'Problem encountered while checking the frozen index.', + [ 'exception' => $e ] + ); + return false; + } + } + + // TODO: This code can be removed in the future as we drop support for + // older versions of the Elastica extension. if ( !isset( $this->config['frozen_index'] ) ) { return false; } $frozenIndex = $this->config['frozen_index']; $indices = [ static::ALL_INDEXES_FROZEN_NAME, $this->getIndexName() ]; - $ids = new \Elastica\Query\Ids( null, $indices ); + $ids = ( new \Elastica\Query\Ids() ) + ->setIds( $indices ); try { $resp = $this->getClient() @@ -830,12 +882,8 @@ GROOVY; ->getType( static::FROZEN_TYPE ) ->search( \Elastica\Query::create( $ids ) ); - if ( $resp->count() === 0 ) { - return false; - } else { - return true; - } - } catch ( Exception $e ) { + return $resp->count() !== 0; + } catch ( \Exception $e ) { LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->warning( 'Problem encountered while checking the frozen index.', [ 'exception' => $e ] @@ -843,14 +891,4 @@ GROOVY; return false; } } - - /** - * @return bool true if running with Elastica 5+ - */ - private function isElastica5() { - // Sadly Elastica does not seem to expose its version so we - // check the inexistence of a class that was removed in the - // version 5 - return !class_exists( \Elastica\Script::class ); - } } diff --git a/MLEB/Translate/ttmserver/FakeTTMServer.php b/MLEB/Translate/ttmserver/FakeTTMServer.php index 16715592..9a35822e 100644 --- a/MLEB/Translate/ttmserver/FakeTTMServer.php +++ b/MLEB/Translate/ttmserver/FakeTTMServer.php @@ -22,7 +22,7 @@ class FakeTTMServer implements ReadableTTMServer, WritableTTMServer { } public function isLocalSuggestion( array $suggestion ) { - false; + return false; } public function expandLocation( array $suggestion ) { @@ -57,4 +57,7 @@ class FakeTTMServer implements ReadableTTMServer, WritableTTMServer { public function isFrozen() { return false; } + + public function setDoReIndex() { + } } diff --git a/MLEB/Translate/ttmserver/FuzzyLikeThis.php b/MLEB/Translate/ttmserver/FuzzyLikeThis.php index 143b3222..82798584 100644 --- a/MLEB/Translate/ttmserver/FuzzyLikeThis.php +++ b/MLEB/Translate/ttmserver/FuzzyLikeThis.php @@ -199,6 +199,7 @@ class FuzzyLikeThis extends \Elastica\Query\AbstractQuery { * @see \Elastica\Query\AbstractQuery::toArray() */ public function toArray() { + $args = []; if ( !empty( $this->_fields ) ) { $args['fields'] = $this->_fields; } diff --git a/MLEB/Translate/ttmserver/Interfaces.php b/MLEB/Translate/ttmserver/Interfaces.php index 1f8cb20e..133f4ae4 100644 --- a/MLEB/Translate/ttmserver/Interfaces.php +++ b/MLEB/Translate/ttmserver/Interfaces.php @@ -33,7 +33,7 @@ interface ReadableTTMServer { * Determines if the suggestion returned by this TTMServer comes * from this wiki or any other wiki. * @param array $suggestion - * @return Bool + * @return bool */ public function isLocalSuggestion( array $suggestion ); @@ -41,7 +41,7 @@ interface ReadableTTMServer { * Given suggestion returned by this TTMServer, constructs fully * qualified URL to the location of the translation. * @param array $suggestion - * @return String URL + * @return string URL */ public function expandLocation( array $suggestion ); } @@ -116,6 +116,12 @@ interface WritableTTMServer { * @return bool true if the service is frozen */ public function isFrozen(); + + /** + * Instruct the service to fully wipe the index and start from scratch. + * @since 2020.01 + */ + public function setDoReIndex(); } /** @@ -135,19 +141,19 @@ interface SearchableTTMServer { public function search( $queryString, $opts, $highlight ); /** - * @param stdClass $resultset + * @param mixed $resultset * @return array[] */ public function getFacets( $resultset ); /** - * @param stdClass $resultset + * @param mixed $resultset * @return int */ public function getTotalHits( $resultset ); /** - * @param stdClass $resultset + * @param mixed $resultset * @return array[] */ public function getDocuments( $resultset ); diff --git a/MLEB/Translate/ttmserver/SolrTTMServer.php b/MLEB/Translate/ttmserver/SolrTTMServer.php deleted file mode 100644 index bb6c244c..00000000 --- a/MLEB/Translate/ttmserver/SolrTTMServer.php +++ /dev/null @@ -1,445 +0,0 @@ -<?php -/** - * TTMServer - The Translate extension translation memory interface - * - * @file - * @author Niklas Laxström - * @copyright Copyright © 2012-2013, Niklas Laxström - * @license GPL-2.0-or-later - * @ingroup TTMServer - */ - -/** - * TTMServer backed based on Solr instance. Depends on Solarium. - * @since 2012-06-27 - * @ingroup TTMServer - * @deprecated 1.27. Will be removed in 1.29. - */ -class SolrTTMServer - extends TTMServer - implements ReadableTTMServer, SearchableTTMServer, WritableTTMServer -{ - /** - * In case auto-commit is not enabled, or even if it is, tell solr to - * commit before this time has passed, in milliseconds. - */ - const COMMIT_WITHIN = 5000; - - protected $client; - - /** - * Reference to the maintenance script to relay logging output. - */ - protected $logger; - - public function __construct( $config ) { - wfDeprecated( __METHOD__, '1.24' ); - - parent::__construct( $config ); - - if ( isset( $config['config'] ) ) { - $this->client = new Solarium_Client( $config['config'] ); - } else { - $this->client = new Solarium_Client(); - } - } - - public function isLocalSuggestion( array $suggestion ) { - return $suggestion['wiki'] === wfWikiID(); - } - - public function expandLocation( array $suggestion ) { - return $suggestion['uri']; - } - - public function query( $sourceLanguage, $targetLanguage, $text ) { - try { - return $this->doQuery( $sourceLanguage, $targetLanguage, $text ); - } catch ( Solarium_Exception $e ) { - throw new TranslationHelperException( 'Solarium exception: ' . $e ); - } - } - - /// @see ReadableTTMServer::query - protected function doQuery( $sourceLanguage, $targetLanguage, $text ) { - /* Two query system: - * 1) Find all strings in source language that match text - * 2) Do another query for translations for those strings - */ - // For now impose a length limit on query string to avoid doing - // very slow queries. Magic number. - if ( strlen( $text ) > 789 ) { - return []; - } - - $query = $this->client->createSelect(); - $query->setFields( [ 'globalid', 'content', 'score' ] ); - - /* The interface usually displays three best candidates. These might - * come from more than three matches, if the translation is the same. - * This might not find all suggestions, if the top N best matching - * source texts don't have translations, but worse matches do. We - * could loop with start parameter to fetch more until we have enough - * suggestions or the quality drops below the cutoff point. */ - $query->setRows( 25 ); - - /* Our string can contain all kind of nasty characters, so we need - * escape them with great pain. */ - $helper = $query->getHelper(); - $dist = $helper->escapePhrase( $text ); - // "edit" could also be ngram of other algorithm - $dist = "strdist($dist,content,edit)"; - /* Note how we need to escape twice here, first the string for strdist - * and then the strdist call itself for the query. And of course every- - * thing will be URL encoded once sent over the line. */ - $query->setQuery( '_val_:%P1%', [ $dist ] ); - - /* Filter queries are supposed to be efficient as they are separately - * cached, but I haven't done any benchmarks. */ - $query->createFilterQuery( 'lang' ) - ->setQuery( 'language:%P1%', [ $sourceLanguage ] ); - - $resultset = $this->client->select( $query ); - - /* This query is doing two unrelated things: - * 1) Collect the message contents and scores so that they can - * be accessed later for the translations we found. - * 2) Build the query string for the query that fetches the - * translations. - * This code is a bit uglier than I'd like it to be, since there - * there is no field that globally identifies a message (message - * definition and translations). */ - $contents = $scores = []; - $queryString = ''; - foreach ( $resultset as $doc ) { - $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid ); - $contents[$sourceId] = $doc->content; - $scores[$sourceId] = $doc->score; - - $globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" ); - $queryString .= "globalid:$globalid "; - } - - // Second query to fetch available translations - $fetchQuery = $this->client->createSelect(); - $fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] ); - // This come in random order, so have to fetch all and sort - $fetchQuery->setRows( 25 ); - $fetchQuery->setQuery( $queryString ); - // With AND we would not find anything, obviously. - $fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR ); - - $translations = $this->client->select( $fetchQuery ); - - $suggestions = []; - foreach ( $translations as $doc ) { - /* Construct the matching source id */ - $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid ); - - /* Unfortunately we cannot do this on the search server, - * because score is not a real field and thus cannot be - * used in a filter query. */ - $quality = $scores[$sourceId]; - if ( $quality < $this->config['cutoff'] ) { - continue; - } - - $suggestions[] = [ - 'source' => $contents[$sourceId], - 'target' => $doc->content, - 'context' => $doc->messageid, - 'quality' => $quality, - 'wiki' => $doc->wiki, - 'location' => $doc->messageid . '/' . $targetLanguage, - 'uri' => $doc->uri, - ]; - } - - /* Like mentioned above, we get results in random order. Sort them - * now to have best matches first as expected by callers. */ - uasort( $suggestions, function ( $a, $b ) { - if ( $a['quality'] === $b['quality'] ) { - return 0; - } - - return ( $a['quality'] < $b['quality'] ) ? 1 : -1; - } ); - - return $suggestions; - } - - /* Write functions */ - - public function update( MessageHandle $handle, $targetText ) { - if ( $handle->getCode() === '' ) { - return false; - } - - /* There are various different cases here: - * [new or updated] [fuzzy|non-fuzzy] [translation|definition] - * 1) We don't distinguish between new or updated here. - * 2) Delete old translation, but not definition - * 3) Insert new translation or definition, if non-fuzzy - * The definition should never be fuzzied anyway. - * - * These only apply to known messages. - */ - - $update = $this->client->createUpdate(); - $title = $handle->getTitle(); - - $doDelete = true; - $sourceLanguage = ''; - if ( $handle->isValid() ) { - $sourceLanguage = $handle->getGroup()->getSourceLanguage(); - if ( $handle->getCode() === $sourceLanguage ) { - $doDelete = false; - } - } - - if ( $doDelete ) { - $base = Title::makeTitle( $title->getNamespace(), $handle->getKey() ); - $conds = [ - 'wiki' => wfWikiID(), - 'language' => $handle->getCode(), - 'messageid' => $base->getPrefixedText(), - ]; - foreach ( $conds as $key => &$value ) { - $value = "$key:" . $update->getHelper()->escapePhrase( $value ); - } - $update->addDeleteQuery( implode( ' AND ', $conds ) ); - } - - if ( $targetText !== null ) { - if ( $handle->isValid() ) { - // Of the message definition page - $targetTitle = $handle->getTitle(); - $sourceTitle = Title::makeTitle( - $targetTitle->getNamespace(), - $handle->getKey() . '/' . $sourceLanguage - ); - $revId = (int)$sourceTitle->getLatestRevID(); - /* Note: in some cases the source page might not exist, in this case - * we use 0 as message version identifier, to differentiate them from - * orphan messages */ - } else { - $revId = 'orphan'; - } - - $doc = $this->createDocument( $handle, $targetText, $revId ); - // Add document and commit within X seconds. - $update->addDocument( $doc, null, self::COMMIT_WITHIN ); - } - - try { - $this->client->update( $update ); - } catch ( Solarium_Exception $e ) { - error_log( 'SolrTTMServer update-write failed' ); - - return false; - } - - return true; - } - - /** - * @see schema.xml - * @param MessageHandle $handle - * @param string $text - * @param int $revId - * @return Solarium_Document_ReadWrite - */ - protected function createDocument( MessageHandle $handle, $text, $revId ) { - $language = $handle->getCode(); - $translationTitle = $handle->getTitle(); - - $title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() ); - $wiki = wfWikiID(); - $messageid = $title->getPrefixedText(); - $globalid = "$wiki-$messageid-$revId/$language"; - - $doc = new Solarium_Document_ReadWrite(); - $doc->wiki = $wiki; - $doc->uri = $translationTitle->getCanonicalURL(); - $doc->messageid = $messageid; - $doc->globalid = $globalid; - - $doc->language = $language; - $doc->content = $text; - $doc->setField( 'group', $handle->getGroupIds() ); - - return $doc; - } - - public function beginBootstrap() { - $update = $this->client->createUpdate(); - $query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiID() ); - $update->addDeleteQuery( $query ); - $update->addCommit(); - $this->client->update( $update ); - } - - public function beginBatch() { - // I hate the rule that forbids {} - } - - public function batchInsertDefinitions( array $batch ) { - $lb = new LinkBatch(); - foreach ( $batch as $data ) { - $lb->addObj( $data[0]->getTitle() ); - } - $lb->execute(); - - $this->batchInsertTranslations( $batch ); - } - - public function batchInsertTranslations( array $batch ) { - $update = $this->client->createUpdate(); - foreach ( $batch as $key => $data ) { - list( $handle, $sourceLanguage, $text ) = $data; - $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID(); - $doc = $this->createDocument( $handle, $text, $revId ); - // Add document and commit within X seconds. - $update->addDocument( $doc, null, self::COMMIT_WITHIN ); - } - - $retries = 5; - - while ( $retries-- > 0 ) { - try { - $this->client->update( $update ); - break; - } catch ( Solarium_Client_HttpException $e ) { - if ( $retries === 0 ) { - throw $e; - } else { - $c = get_class( $e ); - $msg = $e->getMessage(); - $this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" ); - sleep( 10 ); - } - } - } - } - - public function endBatch() { - $update = $this->client->createUpdate(); - $this->client->update( $update ); - } - - public function endBootstrap() { - $update = $this->client->createUpdate(); - $update->addCommit(); - $update->addOptimize(); - $this->client->update( $update ); - } - - public function getSolarium() { - return $this->client; - } - - public function setLogger( $logger ) { - $this->logger = $logger; - } - - // Can it get any uglier? - protected function logOutput( $text ) { - if ( $this->logger ) { - $this->logger->statusLine( "$text\n" ); - } - } - - /** - * Search interface - * @param string $queryString - * @param array $opts - * @param array $highlight - * @return array - */ - public function search( $queryString, $opts, $highlight ) { - $client = $this->getSolarium(); - - $query = $client->createSelect(); - $dismax = $query->getDisMax(); - $dismax->setQueryParser( 'edismax' ); - $query->setQuery( $queryString ); - $query->setRows( $opts['limit'] ); - $query->setStart( $opts['offset'] ); - - list( $pre, $post ) = $highlight; - $hl = $query->getHighlighting(); - $hl->setFields( 'text' ); - $hl->setSimplePrefix( $pre ); - $hl->setSimplePostfix( $post ); - $hl->setMaxAnalyzedChars( '5000' ); - $hl->setFragSize( '5000' ); - $hl->setSnippets( 1 ); - - $languageFilter = $opts['language']; - if ( $languageFilter !== '' ) { - $query->createFilterQuery( 'languageFilter' ) - ->setQuery( 'language:%P1%', [ $languageFilter ] ) - ->addTag( 'filter' ); - } - - $groupFilter = $opts['group']; - if ( $groupFilter !== '' ) { - $query->createFilterQuery( 'groupFilter' ) - ->setQuery( 'group:%P1%', [ $groupFilter ] ) - ->addTag( 'filter' ); - } - - $facetSet = $query->getFacetSet(); - - $language = $facetSet->createFacetField( 'language' ); - $language->setField( 'language' ); - $language->setMinCount( 1 ); - $language->addExclude( 'filter' ); - - $group = $facetSet->createFacetField( 'group' ); - $group->setField( 'group' ); - $group->setMinCount( 1 ); - $group->setMissing( true ); - $group->addExclude( 'filter' ); - - try { - return $client->select( $query ); - } catch ( Solarium_Client_HttpException $e ) { - throw new TTMServer( $e->getMessage() ); - } - } - - public function getFacets( $resultset ) { - return [ - 'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ), - 'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'group' ) ), - ]; - } - - public function getTotalHits( $resultset ) { - return $resultset->getNumFound(); - } - - public function getDocuments( $resultset ) { - $highlighting = $resultset->getHighlighting(); - $ret = []; - foreach ( $resultset as $document ) { - $fields = iterator_to_array( $document ); - // Compatibility mapping - $fields['localid'] = $fields['messageid']; - - $hdoc = $highlighting->getResult( $document->globalid ); - $text = $hdoc->getField( 'text' ); - if ( $text === [] ) { - $text = $document->text; - } else { - $text = $text[0]; - } - - $fields['content'] = $text; - $ret[] = $fields; - } - - return $ret; - } -} diff --git a/MLEB/Translate/ttmserver/TTMServer.php b/MLEB/Translate/ttmserver/TTMServer.php index 98724869..2a7f0900 100644 --- a/MLEB/Translate/ttmserver/TTMServer.php +++ b/MLEB/Translate/ttmserver/TTMServer.php @@ -15,13 +15,22 @@ * @ingroup TTMServer */ class TTMServer { + /** @var array */ protected $config; - protected function __construct( $config ) { + /** + * @param array $config + */ + protected function __construct( array $config ) { $this->config = $config; } - public static function factory( $config ) { + /** + * @param array $config + * @return TTMServer|null + * @throws MWException + */ + public static function factory( array $config ) { if ( isset( $config['class'] ) ) { $class = $config['class']; @@ -38,7 +47,7 @@ class TTMServer { } } - throw new MWEXception( 'TTMServer with no type' ); + throw new MWException( 'TTMServer with no type' ); } /** @@ -60,12 +69,21 @@ class TTMServer { return new FakeTTMServer(); } + /** + * @param array[] $suggestions + * @return array[] + */ public static function sortSuggestions( array $suggestions ) { usort( $suggestions, [ __CLASS__, 'qualitySort' ] ); return $suggestions; } + /** + * @param array $a + * @param array $b + * @return int + */ protected static function qualitySort( $a, $b ) { list( $c, $d ) = [ $a['quality'], $b['quality'] ]; if ( $c === $d ) { @@ -137,15 +155,17 @@ class TTMServer { /** * Called from TranslateEditAddons::onSave * @param MessageHandle $handle - * @param string $text - * @param bool $fuzzy */ - public static function onChange( MessageHandle $handle, $text, $fuzzy ) { + public static function onChange( MessageHandle $handle ) { $job = TTMServerMessageUpdateJob::newJob( $handle, 'refresh' ); JobQueueGroup::singleton()->push( $job ); } - public static function onGroupChange( MessageHandle $handle, $old, $new ) { + /** + * @param MessageHandle $handle + * @param array $old + */ + public static function onGroupChange( MessageHandle $handle, $old ) { if ( $old === [] ) { // Don't bother for newly added messages return; diff --git a/MLEB/Translate/ttmserver/TTMServerMessageUpdateJob.php b/MLEB/Translate/ttmserver/TTMServerMessageUpdateJob.php index 7a6a91d7..2d59813e 100644 --- a/MLEB/Translate/ttmserver/TTMServerMessageUpdateJob.php +++ b/MLEB/Translate/ttmserver/TTMServerMessageUpdateJob.php @@ -34,20 +34,20 @@ class TTMServerMessageUpdateJob extends Job { * Number of *retries* allowed, 4 means we attempt * to run the job 5 times (1 initial attempt + 4 retries). */ - const MAX_ERROR_RETRY = 4; + protected const MAX_ERROR_RETRY = 4; /** * Constant used by backoffDelay(). * With 7 the cumulative delay between the first and last attempt is * between 8 and 33 minutes. */ - const WRITE_BACKOFF_EXPONENT = 7; + protected const WRITE_BACKOFF_EXPONENT = 7; /** * The maximum amount of time jobs delayed due to frozen services can remain * in the job queue. */ - const DROP_DELAYED_JOBS_AFTER = 86400; // 60 * 60 * 24 * 1; + public const DROP_DELAYED_JOBS_AFTER = 86400; // 60 * 60 * 24 * 1; /** * @param MessageHandle $handle diff --git a/MLEB/Translate/ttmserver/schema.xml b/MLEB/Translate/ttmserver/schema.xml deleted file mode 100644 index 0ed2f047..00000000 --- a/MLEB/Translate/ttmserver/schema.xml +++ /dev/null @@ -1,45 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- This is schema file for TTMServer using Solr as backend --> -<schema name="ttmserver" version="1.5"> - <types> - <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> - <fieldType name="string" class="solr.StrField" sortMissingLast="true" /> - <fieldType name="tint" class="solr.TrieIntField" precisionStep="50" positionIncrementGap="0"/> - <!-- Our input can basically be in any language, so we use either - language agnostic processing or something that can adapt to - the language in question. --> - <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <!-- Consider using solr.ICUTokenizerFactory --> - <tokenizer class="solr.StandardTokenizerFactory"/> - <!-- Consider using solr.ICUNormalizer2FilterFactory --> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - </fieldType> - </types> - - <fields> - <field name="_version_" type="long" indexed="true" stored="true" /> - - <!-- If multiple wikis are using the same server, this will tell which one - owns this document. Maps to MediaWiki wfWikiId(). --> - <field name="wiki" type="string" indexed="true" stored="true" required="true" /> - <!-- Title::getPrefixedText() of the message definition page. --> - <field name="messageid" type="string" indexed="true" stored="true" required="true" /> - <!-- Consists of concatenation of wiki and messageid. --> - <field name="globalid" type="string" indexed="true" stored="true" required="true" /> - <!-- URL or something to the translation in the wiki. --> - <field name="uri" type="string" indexed="true" stored="true" required="true" /> - - <!-- FACETs: Language and groups. --> - <field name="language" type="string" indexed="true" stored="true" required="true" /> - <field name="group" multiValued="true" indexed="true" stored="true" type="string" /> - - <field name="content" type="string" indexed="true" stored="true" required="true" /> - - <field name="text" type="text_ws" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> - <copyField source="content" dest="text"/> - </fields> - <defaultSearchField>text</defaultSearchField> - <uniqueKey>globalid</uniqueKey> -</schema> |