search.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. /**
  2. * Search
  3. */
  4. var elasticsearch = require('elasticsearch'),
  5. debug = require('debug')('growi:lib:search');
  /**
   * Search client backed by Elasticsearch.
   *
   * Sets the paging defaults, splits `esUri` into host and index name with
   * `parseUri`, creates the elasticsearch client, subscribes to page events
   * and records the path of the index mapping file.
   *
   * @param {Object} crowi - application object; `event(name)` and `resourceDir` are used
   * @param {string} esUri - Elasticsearch URI, optionally followed by `/{index name}`
   */
  function SearchClient(crowi, esUri) {
    // Paging defaults used by appendResultSize()
    this.DEFAULT_OFFSET = 0;
    this.DEFAULT_LIMIT = 50;

    this.esUri = esUri;
    this.crowi = crowi;

    const parsed = this.parseUri(this.esUri);
    this.host = parsed.host;
    this.index_name = parsed.index_name;

    const clientOptions = {
      host: this.host,
      requestTimeout: 5000,
    };
    this.client = new elasticsearch.Client(clientOptions);

    this.registerUpdateEvent();

    this.mappingFile = crowi.resourceDir + 'search/mappings.json';
  }
  /**
   * Placeholder for an Elasticsearch version check.
   * Not implemented yet: the body is empty and nothing is returned.
   */
  SearchClient.prototype.checkESVersion = function() {
    // TODO: implement
  };
  /**
   * Subscribes this client to the 'page' event emitter obtained from crowi.
   *
   * The 'create', 'update' and 'delete' events are routed to
   * syncPageCreated, syncPageUpdated and syncPageDeleted respectively,
   * each bound to this instance.
   */
  SearchClient.prototype.registerUpdateEvent = function() {
    const emitter = this.crowi.event('page');
    const routes = [
      ['create', 'syncPageCreated'],
      ['update', 'syncPageUpdated'],
      ['delete', 'syncPageDeleted'],
    ];

    routes.forEach((route) => {
      const eventName = route[0];
      const methodName = route[1];
      emitter.on(eventName, this[methodName].bind(this));
    });
  };
  /**
   * Decides whether a page belongs in the search index.
   *
   * A page is indexed only when all of the following hold:
   *  - `page.grant` is exactly 1 (FIXME: magic number; presumably the
   *    "public" grant — confirm against the Page model),
   *  - `page.redirectTo` is exactly `null`,
   *  - `page.isDeleted()` is falsy.
   *
   * @param {Object} page - page document exposing `grant`, `redirectTo` and `isDeleted()`
   * @returns {boolean} true when the page should be indexed
   */
  SearchClient.prototype.shouldIndexed = function(page) {
    // Short-circuits left to right, so isDeleted() is only consulted for
    // pages that already passed the grant and redirect checks.
    return page.grant === 1
      && page.redirectTo === null
      && !page.isDeleted();
  };
  /**
   * Splits an Elasticsearch URI into the host part and the index name.
   *
   * Accepted form: `http(s)://{HOST}/{INDEX_NAME}`. When the URI carries no
   * path after the host, the whole string is used as the host and the index
   * name falls back to 'crowi'. A BONSAI_URL has the form
   * `https://{ID}:{PASSWORD}@{HOST}`, so it takes the fallback.
   *
   * @param {string} uri - Elasticsearch URI
   * @returns {{host: string, index_name: string}} parsed host and index name
   */
  SearchClient.prototype.parseUri = function(uri) {
    let index_name = 'crowi';
    let host = uri;

    // `m` is declared locally; assigning an undeclared `m` would create an
    // implicit global (and throw a ReferenceError in strict mode).
    const m = uri.match(/^(https?:\/\/[^\/]+)\/(.+)$/);
    if (m) {
      host = m[1];
      index_name = m[2];
    }

    return {
      host,
      index_name,
    };
  };
  /**
   * Creates the index named `this.index_name`, using the JSON loaded from
   * `this.mappingFile` as the request body.
   *
   * @param {string} [uri] - unused
   * @returns {*} whatever `client.indices.create` returns
   */
  SearchClient.prototype.buildIndex = function(uri) {
    const mappings = require(this.mappingFile);
    const params = {
      index: this.index_name,
      body: mappings,
    };
    return this.client.indices.create(params);
  };
  /**
   * Deletes the index named `this.index_name`.
   *
   * @param {string} [uri] - unused
   * @returns {*} whatever `client.indices.delete` returns
   */
  SearchClient.prototype.deleteIndex = function(uri) {
    const params = { index: this.index_name };
    return this.client.indices.delete(params);
  };
  /**
   * Appends a bulk "update" action for `page` to `body`.
   *
   * Two entries are pushed: the action line targeting the page id, and a
   * partial document sent with `doc_as_upsert: true`.
   *
   * @param {Array} body - bulk request body being assembled (mutated)
   * @param {Object} page - page with `_id`, `path`, `revision.body`,
   *   `commentCount`, `liker` and `updatedAt`
   * @throws {Error} when `body` is not an array
   */
  SearchClient.prototype.prepareBodyForUpdate = function(body, page) {
    if (!Array.isArray(body)) {
      throw new Error('Body must be an array.');
    }

    const action = {
      update: {
        _index: this.index_name,
        _type: 'pages',
        _id: page._id.toString(),
      },
    };

    const fields = {
      path: page.path,
      body: page.revision.body,
      comment_count: page.commentCount,
      bookmark_count: 0, // todo
      like_count: page.liker.length || 0,
      updated_at: page.updatedAt,
    };

    body.push(action, { doc: fields, doc_as_upsert: true });
  };
  /**
   * Appends a bulk "index" action for `page` to `body`.
   *
   * Two entries are pushed: the action line targeting the page id, and the
   * full document to index.
   *
   * @param {Array} body - bulk request body being assembled (mutated)
   * @param {Object} page - page with `_id`, `path`, `revision.body`,
   *   `creator.username`, `commentCount`, `liker`, `createdAt` and `updatedAt`
   * @throws {Error} when `body` is not an array
   */
  SearchClient.prototype.prepareBodyForCreate = function(body, page) {
    if (!Array.isArray(body)) {
      throw new Error('Body must be an array.');
    }

    const action = {
      index: {
        _index: this.index_name,
        _type: 'pages',
        _id: page._id.toString(),
      },
    };

    const source = {
      path: page.path,
      body: page.revision.body,
      username: page.creator.username,
      comment_count: page.commentCount,
      bookmark_count: 0, // todo
      like_count: page.liker.length || 0,
      created_at: page.createdAt,
      updated_at: page.updatedAt,
    };

    body.push(action, source);
  };
  /**
   * Appends a bulk "delete" action for `page` to `body`.
   *
   * @param {Array} body - bulk request body being assembled (mutated)
   * @param {Object} page - page whose `_id` identifies the document to remove
   * @throws {Error} when `body` is not an array
   */
  SearchClient.prototype.prepareBodyForDelete = function(body, page) {
    if (!Array.isArray(body)) {
      throw new Error('Body must be an array.');
    }

    const target = {
      _index: this.index_name,
      _type: 'pages',
      _id: page._id.toString(),
    };

    body.push({ delete: target });
  };
  /**
   * Indexes the given pages with a single bulk request.
   *
   * @param {Array<Object>} pages - pages to index
   * @returns {*} whatever `client.bulk` returns
   */
  SearchClient.prototype.addPages = function(pages) {
    const body = [];

    pages.forEach((page) => {
      this.prepareBodyForCreate(body, page);
    });

    debug('addPages(): Sending Request to ES', body);
    return this.client.bulk({ body });
  };
  /**
   * Updates (upserts) the given pages with a single bulk request.
   *
   * @param {Array<Object>} pages - pages to update
   * @returns {*} whatever `client.bulk` returns
   */
  SearchClient.prototype.updatePages = function(pages) {
    const body = [];

    pages.forEach((page) => {
      this.prepareBodyForUpdate(body, page);
    });

    debug('updatePages(): Sending Request to ES', body);
    return this.client.bulk({ body });
  };
  /**
   * Removes the given pages from the index with a single bulk request.
   *
   * @param {Array<Object>} pages - pages to remove
   * @returns {*} whatever `client.bulk` returns
   */
  SearchClient.prototype.deletePages = function(pages) {
    const body = [];

    pages.forEach((page) => {
      this.prepareBodyForDelete(body, page);
    });

    debug('deletePages(): Sending Request to ES', body);
    return this.client.bulk({ body });
  };
  /**
   * Streams every page from the Page model and indexes it in bulk batches.
   *
   * Pages without a creator or revision, or rejected by shouldIndexed(),
   * are skipped. Whenever the buffered bulk body exceeds 2000 entries it is
   * sent without waiting for the response (failures of these intermediate
   * batches are only logged). When the stream closes, the remaining entries
   * are sent and the promise settles with that final response.
   *
   * @returns {Promise<Object|undefined>} resolves with the final bulk
   *   response, or with undefined when nothing was left to send; rejects
   *   when the cursor emits an error or the final bulk request fails
   */
  SearchClient.prototype.addAllPages = function() {
    const self = this;
    const Page = this.crowi.model('Page');
    const cursor = Page.getStreamOfFindAll();
    let body = [];
    let sent = 0;
    let skipped = 0;

    return new Promise(function(resolve, reject) {
      cursor.on('data', function(doc) {
        if (!doc.creator || !doc.revision || !self.shouldIndexed(doc)) {
          skipped++;
          return;
        }

        self.prepareBodyForCreate(body, doc);

        if (body.length > 2000) {
          sent++;
          debug('Sending request (seq, skipped)', sent, skipped);

          // Fire-and-forget: intermediate batches are best effort.
          self.client.bulk({
            body: body,
            requestTimeout: Infinity,
          }).then((res) => {
            debug('addAllPages add anyway (items, errors, took): ', (res.items || []).length, res.errors, res.took);
          }).catch((err) => {
            debug('addAllPages error on add anyway: ', err);
          });

          // Start a fresh buffer; the request above keeps the old array.
          body = [];
        }
      }).on('error', function(err) {
        debug('Error cursor:', err);
        // Surface the stream failure to the caller instead of only logging
        // it; otherwise the promise could stay pending forever. A later
        // resolve() from the 'close' handler is then a no-op.
        reject(err);
      }).on('close', function() {
        // all done
        // return if body is empty
        // see: https://github.com/weseek/growi/issues/228
        if (body.length === 0) {
          return resolve();
        }

        // Finally, send everything that is still buffered.
        self.client.bulk({
          body: body,
          requestTimeout: Infinity,
        })
          .then(function(res) {
            debug('Reponse from es (item length, errros, took):', (res.items || []).length, res.errors, res.took);
            return resolve(res);
          }).catch(function(err) {
            debug('Err from es:', err);
            return reject(err);
          });
      });
    });
  };
  /**
   * Runs `query` against Elasticsearch and reduces the response to:
   * {
   *   meta: { took: Integer, total: Integer, results: Integer },
   *   data: [ { _id, _score }, ... ],
   * }
   *
   * `results` is the number of hits contained in this response.
   *
   * @param {Object} query - request passed unchanged to `client.search`
   * @returns {Promise<Object>} the reduced result; rejects when the request
   *   fails or the response cannot be read
   */
  SearchClient.prototype.search = function(query) {
    const client = this.client;

    const summarize = (response) => {
      const hits = response.hits.hits;
      return {
        meta: {
          took: response.took,
          total: response.hits.total,
          results: hits.length,
        },
        data: hits.map((hit) => ({ _id: hit._id, _score: hit._score })),
      };
    };

    return new Promise((resolve, reject) => {
      client.search(query)
        .then((response) => {
          resolve(summarize(response));
        })
        .catch((err) => {
          reject(err);
        });
    });
  };
  /**
   * Builds a search request skeleton sorted by `updated_at`, newest first.
   *
   * The `body.query` object is left empty for the appendCriteria* methods,
   * and the default paging is applied via appendResultSize().
   *
   * @param {Object} [option]
   * @param {Array<string>} [option.fields] - `_source` fields to return;
   *   defaults to ['path'] (returning the path is mostly for debugging)
   * @returns {Object} search request
   */
  SearchClient.prototype.createSearchQuerySortedByUpdatedAt = function(option) {
    const sourceFields = (option && option.fields) || ['path'];

    const request = {
      index: this.index_name,
      type: 'pages',
      body: {
        sort: [{ updated_at: { order: 'desc' } }],
        query: {}, // criteria are appended later
        _source: sourceFields,
      },
    };

    this.appendResultSize(request);
    return request;
  };
  /**
   * Builds a search request skeleton sorted by relevance score, best first.
   *
   * The `body.query` object is left empty for the appendCriteria* methods,
   * and the default paging is applied via appendResultSize().
   *
   * @param {Object} [option]
   * @param {Array<string>} [option.fields] - `_source` fields to return; defaults to ['path']
   * @returns {Object} search request
   */
  SearchClient.prototype.createSearchQuerySortedByScore = function(option) {
    const sourceFields = (option && option.fields) || ['path'];

    const request = {
      index: this.index_name,
      type: 'pages',
      body: {
        sort: [{ _score: { order: 'desc' } }],
        query: {}, // criteria are appended later
        _source: sourceFields,
      },
    };

    this.appendResultSize(request);
    return request;
  };
  /**
   * Sets the paging window (`from` / `size`) on a search request.
   *
   * Falsy arguments (including 0) fall back to DEFAULT_OFFSET and
   * DEFAULT_LIMIT respectively.
   *
   * @param {Object} query - search request (mutated)
   * @param {number} [from] - offset of the first hit
   * @param {number} [size] - maximum number of hits
   */
  SearchClient.prototype.appendResultSize = function(query, from, size) {
    const offset = from ? from : this.DEFAULT_OFFSET;
    const limit = size ? size : this.DEFAULT_LIMIT;

    query.from = offset;
    query.size = limit;
  };
  /**
   * Adds keyword criteria to a request created by
   * createSearchQuerySortedByScore() or createSearchQuerySortedByUpdatedAt().
   *
   * `keyword` is split with getParsedKeywords() and translated as follows:
   *  - plain words    -> one multi_match in `bool.must` (operator "and")
   *  - -words         -> one multi_match in `bool.must_not` (operator "or")
   *  - "phrases"      -> phrase multi_match queries appended to `bool.must`
   *  - -"phrases"     -> phrase multi_match queries appended to `bool.must_not`
   *
   * Existing `bool.must` / `bool.must_not` arrays are kept and extended;
   * they are only created when missing or not arrays.
   *
   * @param {Object} query - search request (mutated)
   * @param {string} keyword - raw keyword string entered by the user
   */
  SearchClient.prototype.appendCriteriaForKeywordContains = function(query, keyword) {
    if (!query.body.query.bool) {
      query.body.query.bool = {};
    }
    const bool = query.body.query.bool;

    // The arrays live under `bool`. Checking `query.body.query.must` (which
    // never exists) would reset them on every call and drop earlier clauses.
    if (!Array.isArray(bool.must)) {
      bool.must = [];
    }
    if (!Array.isArray(bool.must_not)) {
      bool.must_not = [];
    }

    // Builds the multi_match clause for a list of plain words.
    const buildWordQuery = function(words, operator) {
      return {
        multi_match: {
          query: words.join(' '),
          // TODO: By user's i18n setting, change boost or search target fields
          fields: [
            'path_ja^2',
            'path_en^2',
            'body_ja',
          ],
          operator: operator,
        },
      };
    };

    // Builds one phrase clause per quoted phrase.
    const buildPhraseQueries = function(phrases) {
      return phrases.map(function(phrase) {
        return {
          multi_match: {
            query: phrase, // each phrase is a quoted string
            type: 'phrase',
            // The "*_ja" fields are not used here, because we want to
            // analyze (parse) the search words.
            fields: [
              'path_raw^2',
              'body_raw',
            ],
          },
        };
      });
    };

    const parsedKeywords = this.getParsedKeywords(keyword);

    if (parsedKeywords.match.length > 0) {
      bool.must.push(buildWordQuery(parsedKeywords.match, 'and'));
    }

    if (parsedKeywords.not_match.length > 0) {
      bool.must_not.push(buildWordQuery(parsedKeywords.not_match, 'or'));
    }

    // NOTE(review): as in the original implementation, the phrase queries are
    // pushed as ONE nested array element rather than spread into the clause
    // list — confirm this is what Elasticsearch is expected to receive.
    if (parsedKeywords.phrase.length > 0) {
      bool.must.push(buildPhraseQueries(parsedKeywords.phrase));
    }

    if (parsedKeywords.not_phrase.length > 0) {
      bool.must_not.push(buildPhraseQueries(parsedKeywords.not_phrase));
    }
  };
  /**
   * Restricts a request created by createSearchQuerySortedByScore() or
   * createSearchQuerySortedByUpdatedAt() to pages located under `path`.
   *
   * A single trailing slash on `path` is dropped, then a wildcard filter
   * for `{path}/*` is appended to `bool.filter`.
   *
   * @param {Object} query - search request (mutated)
   * @param {string} path - parent path
   */
  SearchClient.prototype.appendCriteriaForPathFilter = function(query, path) {
    const root = query.body.query;
    if (!root.bool) {
      root.bool = {};
    }
    if (!Array.isArray(root.bool.filter)) {
      root.bool.filter = [];
    }

    const parent = path.endsWith('/') ? path.slice(0, -1) : path;

    root.bool.filter.push({
      wildcard: {
        path: parent + '/*',
      },
    });
  };
  /**
   * Searches all indexed pages for `keyword`, best score first.
   *
   * @param {string} keyword - raw keyword string
   * @param {Object} [option]
   * @param {number} [option.offset] - offset of the first hit to return
   * @returns {Promise<Object>} result of search()
   */
  SearchClient.prototype.searchKeyword = function(keyword, option) {
    // Tolerate a missing option object instead of throwing on `.offset`.
    const from = (option && option.offset) || null;

    const query = this.createSearchQuerySortedByScore();
    this.appendCriteriaForKeywordContains(query, keyword);

    // Apply the requested offset, the same way searchKeywordUnderPath() does;
    // without this the offset was read but never used.
    if (from) {
      this.appendResultSize(query, from);
    }

    return this.search(query);
  };
  /**
   * Placeholder for a search that matches against page paths only.
   * Not implemented yet: the body is empty and nothing is returned.
   *
   * @param {string} keyword - unused
   * @param {string} prefix - unused
   */
  SearchClient.prototype.searchByPath = function(keyword, prefix) {
    // TODO: search by path name only
  };
  /**
   * Searches the pages located under `path` for `keyword`, best score first.
   *
   * @param {string} keyword - raw keyword string
   * @param {string} path - parent path to restrict the search to
   * @param {Object} option - required; `option.offset` (when truthy) is the
   *   offset of the first hit to return
   * @returns {Promise<Object>} result of search()
   */
  SearchClient.prototype.searchKeywordUnderPath = function(keyword, path, option) {
    const offset = option.offset || null;

    const request = this.createSearchQuerySortedByScore();
    this.appendCriteriaForKeywordContains(request, keyword);
    this.appendCriteriaForPathFilter(request, path);

    // Only override the default paging when an offset was requested.
    if (offset) {
      this.appendResultSize(request, offset);
    }

    return this.search(request);
  };
  /**
   * Splits a raw keyword string into four groups:
   *  - match:      plain words
   *  - not_match:  words prefixed with "-" (prefix removed)
   *  - phrase:     double-quoted phrases (quotes are kept)
   *  - not_phrase: double-quoted phrases prefixed with "-" (prefix removed,
   *                quotes are kept)
   *
   * Runs of whitespace are collapsed to a single space before parsing. A
   * lone "-" is treated as a plain word.
   *
   * @param {string} keyword - raw keyword string
   * @returns {{match: string[], not_match: string[], phrase: string[], not_phrase: string[]}}
   */
  SearchClient.prototype.getParsedKeywords = function(keyword) {
    const matchWords = [];
    const notMatchWords = [];
    const phraseWords = [];
    const notPhraseWords = [];

    // String#trim returns a new string, so its result has to be used.
    let rest = keyword.trim().replace(/\s+/g, ' ');

    // First: extract quoted phrases, optionally negated with a leading "-"
    const phrasePattern = /(-?"[^"]+")/g;
    const phrases = rest.match(phrasePattern);

    if (phrases !== null) {
      rest = rest.replace(phrasePattern, '');

      phrases.forEach(function(phrase) {
        if (phrase.startsWith('-')) {
          notPhraseWords.push(phrase.slice(1));
        }
        else {
          phraseWords.push(phrase);
        }
      });
    }

    // Second: classify the remaining words (including minus keywords)
    rest.split(' ').forEach(function(word) {
      if (word === '') {
        return;
      }

      // Read the capture from the match result rather than the deprecated
      // global RegExp.$1.
      const negated = /^-(.+)$/.exec(word);
      if (negated) {
        notMatchWords.push(negated[1]);
      }
      else {
        matchWords.push(word);
      }
    });

    return {
      match: matchWords,
      not_match: notMatchWords,
      phrase: phraseWords,
      not_phrase: notPhraseWords,
    };
  };
  /**
   * Page 'create' event handler: indexes the new page when shouldIndexed()
   * accepts it. The outcome of the request is only logged.
   *
   * @param {Object} page - created page
   * @param {Object} user - unused
   */
  SearchClient.prototype.syncPageCreated = function(page, user) {
    debug('SearchClient.syncPageCreated', page.path);

    if (this.shouldIndexed(page)) {
      this.addPages([page])
        .then((res) => {
          debug('ES Response', res);
        })
        .catch((err) => {
          debug('ES Error', err);
        });
    }
  };
  /**
   * Page 'update' event handler.
   *
   * A page that shouldIndexed() no longer accepts is removed from the
   * index; otherwise its document is updated. The outcome of either request
   * is only logged.
   *
   * @param {Object} page - updated page
   * @param {Object} user - unused
   */
  SearchClient.prototype.syncPageUpdated = function(page, user) {
    debug('SearchClient.syncPageUpdated', page.path);

    if (this.shouldIndexed(page)) {
      this.updatePages([page])
        .then((res) => {
          debug('ES Response', res);
        })
        .catch((err) => {
          debug('ES Error', err);
        });
    }
    else {
      // TODO delete
      this.deletePages([page])
        .then((res) => {
          debug('deletePages: ES Response', res);
        })
        .catch((err) => {
          debug('deletePages:ES Error', err);
        });
    }
  };
  /**
   * Page 'delete' event handler: removes the page from the index. The
   * outcome of the request is only logged.
   *
   * @param {Object} page - deleted page
   * @param {Object} user - unused
   */
  SearchClient.prototype.syncPageDeleted = function(page, user) {
    debug('SearchClient.syncPageDeleted', page.path);

    const onSuccess = (res) => {
      debug('deletePages: ES Response', res);
    };
    const onFailure = (err) => {
      debug('deletePages:ES Error', err);
    };

    this.deletePages([page]).then(onSuccess).catch(onFailure);
  };
  490. module.exports = SearchClient;