cel / ssb-wikimedia



Tree: f026a5e330ff1154aa9bc4c080cf82ff95b4d63d

Files: f026a5e330ff1154aa9bc4c080cf82ff95b4d63d / bin.js

16126 bytes
#!/usr/bin/env node

var fs = require('fs')
var path = require('path')
var URL = require('url')
var http = require('http')
var https = require('https')
var crypto = require('crypto')
var readline = require('readline')
var os = require('os')

var ssbClient = require('ssb-client')
var pull = require('pull-stream')

var pkg = require('./package')

var userAgentBase = pkg.name + '/' + pkg.version
var userAgentContact
var userAgentBot = false

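// Estimate how large a message will be once signed and stored, by serializing
// the content inside a dummy envelope with worst-case-size placeholder fields.
// Used later to keep each published message under the 8192-byte limit checked
// below.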
function estimateMessageSize(content) {
  var draftMsg = {
    key: '%0000000000000000000000000000000000000000000=.sha256',
    value: {
      previous: '%0000000000000000000000000000000000000000000=.sha256',
      author: '@0000000000000000000000000000000000000000000=.ed25519',
      sequence: 100000,
      timestamp: 1000000000000.0001,
      hash: 'sha256',
      content: content,
      signature: '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000==.sig.ed25519'
    }
  }
  return JSON.stringify(draftMsg, null, 2).length
}

function get(url, cb) {
  var opts = URL.parse(url)
  opts.headers = {
    'User-Agent': userAgentBase
      + (userAgentContact ? ' (' + userAgentContact + ')' : '')
      + (userAgentBot ? ' bot' : '')
  }
  var h = opts.protocol === 'https:' ? https : http
  h.get(opts, function (res) {
    if (res.statusCode !== 200) {
      console.error(res.headers, url)
      return cb(new Error('HTTP ' + res.statusCode + ' ' + res.statusMessage))
    }
    var bufs = []
    res.on('data', function (buf) {
      bufs.push(buf)
    })
    res.on('end', function () {
      res.removeListener('error', cb)
      var str
      try {
        str = Buffer.concat(bufs).toString('utf8')
      } catch(e) {
        return cb(e)
      }
      cb(null, str)
    })
    res.on('error', cb)
  })
}

function getJson(url, cb) {
  get(url, function (err, str) {
    var data
    try {
      data = JSON.parse(str)
    } catch(e) {
      return cb(e)
    }
    cb(null, data)
  })
}

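// Publish drafts in order. A draft may refer to an earlier draft (e.g. in its
// parents field) by its draftId; replaceDraftIds swaps those placeholders for
// the real message keys as the earlier drafts get published.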
function publishDrafts(sbot, drafts, cb) {
  var draftIdIndex = {}
  drafts.forEach(function (draft, i) {
    draftIdIndex[draft.draftId] = i
  })
  var ids = []

  function replaceDraftIds(obj) {
    if (typeof obj === 'string') {
      var i = draftIdIndex[obj]
      if (typeof i === 'number') {
        var id = ids[i]
        if (!id) throw new ReferenceError('draft references unknown message')
        return id
      }
    } else if (Array.isArray(obj)) {
      return obj.map(replaceDraftIds)
    } else if (obj !== null && typeof obj === 'object') {
      var o = {}
      for (var k in obj) o[k] = replaceDraftIds(obj[k])
      return o
    }
    return obj
  }

  pull(
    pull.values(drafts),
    pull.asyncMap(function (draft, cb) {
      var content = replaceDraftIds(draft.content)
      sbot.publish(content, function (err, msg) {
        if (err) return cb(err)
        ids.push(msg.key)
        cb(null, msg)
      })
    }),
    pull.collect(cb)
  )
}

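// Flags: -n prepares messages and prints them as JSON without publishing,
// -y publishes without asking for confirmation, -h prints usage.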
var args = process.argv.slice(2)
var yes = false
var dry = false
var help = false
var urls = []
args.forEach(function (arg) {
  if (arg[0] === '-') switch (arg) {
    case '-n': return dry = true
    case '-y': return yes = true
    case '-h': return help = true
    default: throw 'Unknown argument: ' + arg
  } else urls.push(arg)
})

if (help) {
  process.stdout.write(fs.readFileSync(path.join(__dirname, 'usage.txt')))
  process.exit(0)
}

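// Matches the api.php URL on a wiki's Special:Version page, via either a
// plain link to api.php or the EditURI <link> element.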
var apiHrefRe = /(?:>api\.php<|<link rel="EditURI").* href="([^"]+\/api\.php)(?:[?#][^"]*)?".*/

ssbClient(function (err, sbot, config) {
  if (err) throw err
  var conf = config.wikimedia || {}
  userAgentContact = conf.contact
  userAgentBot = conf.bot

  if (urls.length === 0) {
    var pagesFile = path.join(config.path, 'wikimedia-pages.txt')
    var pagesData = fs.readFileSync(pagesFile, 'utf8')
    urls = pagesData.split('\n').filter(RegExp.prototype.test.bind(/^[^#]/))
    if (!urls.length) {
      console.log('No pages to sync.')
      return sbot.close()
    }
  }

  var siteWikiBases = {}
  var pagesInfo = urls.map(function (page) {
    // Note: this assumes the wiki is either at / or /wiki/
    var m = /^(https?:\/\/.*?\/)(wiki\/)?(.*)$/.exec(page)
    if (!m) throw 'Unable to parse page URL ' + page
    return {
      url: page,
      site: m[1],
      wikiBase: m[1] + (m[2] || ''),
      title: m[3]
    }
  })
  // Group page info by site (wiki base url)
  var pagesInfoBySite = {}
  pagesInfo.forEach(function (pageInfo) {
    var infos = pagesInfoBySite[pageInfo.site]
      || (pagesInfoBySite[pageInfo.site] = [])
    infos.push(pageInfo)
    siteWikiBases[pageInfo.site] = pageInfo.wikiBase
  })

  var apiBySite = {}
  findApis()

  function findApis() {
    console.log('Finding Wikimedia APIs...')
    // Some possible API locations:
    //   /mediawiki/api.php
    //   /wiki/api.php
    //   /w/api.php
    //   /api.php
    // TODO: do this with fewer HTTP requests, or cache the results across runs
    var waiting = 0
    for (var site in pagesInfoBySite) (function (site) {
      waiting++
      var base = siteWikiBases[site]
      var url = base + 'Special:Version'
      get(url, function (err, html) {
        if (err) throw err
        var m = apiHrefRe.exec(html)
        if (!m) throw new Error('Unable to find api.php for ' + site)
        var api = URL.resolve(url, m[1])
        apiBySite[site] = api
        if (!--waiting) normalizeTitles()
      })
    }(site))
  }

  function normalizeTitles() {
    console.log('Normalizing titles...')
    var waiting = 0
    for (var site in pagesInfoBySite) (function (site) {
      var pagesInfoForSite = pagesInfoBySite[site]
      var pagesInfoForSiteByTitle = {}
      var titles = pagesInfoForSite.map(function (info) {
        pagesInfoForSiteByTitle[info.title] = info
        return info.title
      })
      var api = apiBySite[site]
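      // Query all of this site's titles in one request. U+001F is used as the
      // multi-value separator (the MediaWiki API accepts it as an alternative
      // to '|' when the value starts with U+001F).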
      var url = api
        + '?format=json&action=query'
        + '&titles=' + encodeURIComponent('\x1f' + titles.join('\x1f'))
        + '&' // trailing & needed for some reason
      waiting++
      getJson(url, function (err, data) {
        if (err) throw err
        if (data.warnings) console.trace('Warnings:', data.warnings)
        if (data.query.normalized) data.query.normalized.forEach(function (norm) {
          var info = pagesInfoForSiteByTitle[norm.from]
          if (!info) {
            console.error(JSON.stringify({titles: titles, response: data}, 0, 2))
            throw new Error('Unexpected title in server response')
          }
          // console.log('Normalized title', norm.from, norm.to)
          info.title = norm.to
        })
        if (!--waiting) getRevisions()
      })
    }(site))
  }

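  // For each page: compute a blob id naming the page, find the newest revision
  // already published to SSB, fetch newer revisions from the wiki API, store
  // revision content as blobs, and batch the results into draft messages.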
  function getRevisions() {
    console.log('Getting revisions...')
    var userHashes = {}
    pull(
      pull.values(pagesInfo),
      pull.asyncMap(function (pageInfo, cb) {
        // Calculate blob id for page URL + title, for linking
        pull(
          pull.once(pageInfo.site + '\t' + pageInfo.title),
          sbot.blobs.add(function (err, hash) {
            pageInfo.hash = hash
            cb(null, pageInfo)
          })
        )
      }),
      pull.asyncMap(function (pageInfo, cb) {
        // Get previous messages for this page.
        // Simple solution: find the revision with latest timestamp.
        var maxRevTs = ''
        var maxRevMsgId
        pull(
          sbot.links({
            dest: pageInfo.hash,
            rel: 'pageId',
            values: true,
            meta: false
          }),
          pull.filter(function (msg) {
            var c = msg && msg.value && msg.value.content
            return c
              && c.type === 'wikimedia/revisions'
              && c.site === pageInfo.site
              && c.title === pageInfo.title
          }),
          pull.drain(function (msg) {
            var c = msg && msg.value && msg.value.content
            var revs = Array.isArray(c.revisions) && c.revisions
            if (revs) revs.forEach(function (rev) {
              if (rev && rev.timestamp > maxRevTs) {
                maxRevTs = rev.timestamp
                maxRevMsgId = msg.key
              }
            })
          }, function (err) {
            if (err) return cb(err)
            pageInfo.latestMsgId = maxRevMsgId
            pageInfo.latestRevTs = maxRevTs
            cb(null, pageInfo)
          })
        )
      }),
      pull.map(function (pageInfo) {
        // Get new revisions.
        var rvcontinue, rvdone
        var rvstart = pageInfo.latestRevTs
        var prevId = pageInfo.latestMsgId
        var aborted
        var first = true
        var revisions = pull(
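          // Source: pages through the MediaWiki revisions API, 50 revisions
          // per request, resuming via rvcontinue and starting from the last
          // revision timestamp already recorded in SSB.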
          function (abort, cb) {
            if (aborted = abort) return cb(abort)
            if (rvdone) return cb(true)

            console.log('Getting revisions for', pageInfo.title + '...',
              rvstart || '', rvcontinue || '')
            var url = apiBySite[pageInfo.site]
              + '?format=json&action=query&prop=revisions&rvslots=*'
              + '&titles=' + encodeURIComponent(pageInfo.title)
              + '&rvprop=ids|timestamp|comment|user|sha1|size|slotsha1|slotsize|content|roles|flags|tags'
              + '&rvdir=newer'
              + (rvcontinue ? '&rvcontinue=' + rvcontinue : '')
              + (rvstart ? '&rvstart=' + rvstart : '')
              + '&rvlimit=50'
            getJson(url, function (err, data) {
              if (aborted) return err && console.trace(err)
              if (err) return cb(err)

              var warnings = data.warnings
              if (warnings) {
                if (warnings.main) {
                  if (warnings.main['*'] === 'Unrecognized parameter: rvslots.') {
                    delete warnings.main['*']
                    if (Object.keys(warnings.main).length === 0) {
                      delete warnings.main
                    }
                  }
                }
                if (warnings.revisions) {
                  if (warnings.revisions['*'] === 'Unrecognized values for parameter "rvprop": slotsha1, slotsize, roles.') {
                    delete warnings.revisions['*']
                    if (Object.keys(warnings.revisions).length === 0) {
                      delete warnings.revisions
                    }
                  }
                }
                if (Object.keys(warnings).length > 0) {
                  console.trace('Warnings:', warnings)
                }
              }

              rvcontinue = data.continue && data.continue.rvcontinue
              if (!rvcontinue) rvdone = true
              var page
              if (data.query) for (var pageid in data.query.pages) {
                page = data.query.pages[pageid]
                if (page.title === pageInfo.title) break
                else page = null
              }
              if (!page) {
                console.trace(data.query.pages, pageInfo)
                return cb(new Error('Unable to find page'))
              }
              var revs = page.revisions || []
              console.log('Got ' + revs.length + ' revisions')
              cb(null, revs)
            })
          },
          pull.flatten(),

          pull.filter(function (rev) {
            if (rev.timestamp === rvstart && first) {
              first = false
              return false
            }
            return true
          }),

          pull.through(function (rev) {
            if (!rev.slots) {
              // old API does not use slots.
              // Transform result to be forward-compatible.
              rev.slots = {
                main: {
                  size: rev.size,
                  sha1: rev.sha1,
                  contentmodel: rev.contentmodel,
                  contentformat: rev.contentformat,
                  '*': rev['*']
                }
              }
              delete rev.contentmodel
              delete rev.contentformat
              delete rev['*']
            }
            // duplicate values supplied in new API in slotsize and slotsha1
            delete rev.sha1
            delete rev.size
          }),

          pull.asyncMap(function (rev, cb) {
            // Calculate blob id for user page URL + title, for linking
            var hash = userHashes[rev.user]
            if (hash) {
              rev.userId = hash
              return cb(null, rev)
            }
            pull(
              pull.once(pageInfo.site + '\tUser:' + rev.user),
              sbot.blobs.add(function (err, hash) {
                rev.userId = userHashes[rev.user] = hash
                cb(null, rev)
              })
            )
          }),

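          // Store each slot's content as a blob, after checking it against the
          // sha1 reported by the API; the inline text ('*') is replaced by a
          // link to the blob.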
          pull.asyncMap(function (rev, cb) {
            var waiting = 0
            for (var slot in rev.slots) (function (slot) {
              waiting++
              var slotInfo = rev.slots[slot]
              var content = slotInfo['*']
              if (!content) {
                console.trace(slotInfo)
                return cb(new Error('Missing content'))
              }
              var sha1 = crypto.createHash('sha1').update(content).digest('hex')
              if (sha1 !== slotInfo.sha1) {
                console.trace(slotInfo, sha1)
                return cb(new Error('Mismatched content sha1'))
              }
              pull(
                pull.once(content),
                sbot.blobs.add(function (err, hash) {
                  if (err) return cb(err)
                  slotInfo.link = hash
                  delete slotInfo['*']
                  if (!--waiting) cb(null, rev)
                })
              )
            }(slot))
          })
        )

        var queuedRevisions = []
        var ended
        function cbDraft(content, cb) {
          if (!content.revisions.length) {
            console.log('No revisions for', pageInfo.title)
            return cb(true)
          }
          console.log('Prepared a message',
            'with', content.revisions.length, 'revisions',
            'for', pageInfo.title)
          prevId = '%' + crypto.createHash('sha256').update(JSON.stringify(content)).digest('base64') + '.draft6'
          cb(null, {
            draftId: prevId,
            content: content
          })
        }

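        // This pull-stream source emits one draft message at a time, packing
        // revisions until the estimated message size would exceed 8192 bytes.
        // Each draft gets a provisional '.draft6' id so the next draft can
        // name it in parents before the real message key exists.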
        return function (abort, cb) {
          if (abort) return revisions(abort, cb)
          if (ended) return cb(true)
          var content = {
            type: 'wikimedia/revisions',
            site: pageInfo.site,
            title: pageInfo.title,
            pageId: pageInfo.hash,
            parents: prevId ? [prevId] : undefined,
            revisions: queuedRevisions.splice(0)
          }
          revisions(null, function next(end, revision) {
            if (ended = end) return cbDraft(content, cb)
            content.revisions.push(revision)
            if (estimateMessageSize(content) > 8192) {
              queuedRevisions.push(content.revisions.pop())
              // console.log('filled msg for ', pageInfo.title, ' with ', content.revisions.length, 'revisions')
              return cbDraft(content, cb)
            }
            revisions(null, next)
          })
        }
      }),
      pull.flatten(),

      pull.collect(function (err, drafts) {
        if (err) throw err
        if (dry) {
          console.log(JSON.stringify(drafts, 0, 2))
          return sbot.close()
        }
        if (!drafts.length) {
          console.log('No messages to publish.')
          return sbot.close()
        }
        if (yes) return confirmed(true)
        var rl = readline.createInterface({
          input: process.stdin,
          output: process.stdout
        })
        rl.question('Publish ' + drafts.length + ' messages? [Y/n] ', function (answer) {
          rl.close()
          confirmed(!/^n/i.test(answer))
        })
        function confirmed(yes) {
          if (!yes) return sbot.close()
          publishDrafts(sbot, drafts, function (err, msgs) {
            if (err) throw err
            console.log('Published:\n' + msgs.map(function (msg) {
              return msg.key
            }).join('\n'))
            sbot.close()
          })
        }
      })
    )
  }
})
