cel / ssb-wikimedia



Commit f026a5e330ff1154aa9bc4c080cf82ff95b4d63d

Support sites with various api.php locations

cel authored on 6/28/2020, 2:25:44 PM
cel committed on 6/28/2020, 2:39:56 PM
Parent: 23f1a2b8649b31b64b0da903aec6a9ab15b8fccd

Files changed

README.md (changed)
bin.js (changed)
notes.txt (changed)
README.md

@@ -24,9 +24,9 @@
 
 - Message type is added.
 - Property "site" refers to the Wikimedia site base URL.
 - Property `pageId` is a [denormalization](https://en.wikipedia.org/wiki/Denormalization) used to facilitate querying SSB for an article. It is the SSB blob hash of the values for the site property and title property, separated by a tab ("\t").
-- Content ("*" property in revision slots) is replaced with the id of the SSB blob containing that content, at property "link".
+- Content ("`*`" property in revision slots) is replaced with the id of the SSB blob containing that content, at property "link".
 - Property "parents" is an array of links to the latest previous message(s) of the same type for the same page containing previous revisions to the page. Any revision parent id referenced from the current message should be found in the current message or a message referenced in this parents array.
 - Property "userId" is added to each revision object. It is computed the same as `pageId` but for the User page of the revision author.
 
 Example:
@@ -90,9 +90,9 @@
 - https://meta.wikimedia.org/wiki/Special:MyLanguage/User-Agent_policy
 
 ## License
 
-Copyright (C) 2019 cel @f/6sQ6d2CMxRUhLpspgGIulDxDCwYD7DzFzPNr7u5AU=.ed25519
+Copyright (C) 2019-2020 Charles E. Lehner
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as
 published by the Free Software Foundation, either version 3 of the
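The `pageId` denormalization described above can be made concrete with a short sketch. This is an assumption-laden illustration, not the project's code: it assumes the standard SSB blob id encoding (`&` + base64 SHA-256 + `.sha256`), and the helper name is hypothetical.

var crypto = require('crypto')

// Hypothetical helper: compute the pageId (or userId) described in the
// README, i.e. the SSB blob id of "<site>\t<title>". Assumes SSB's usual
// "&<base64 sha256>.sha256" blob id format.
function pageIdFor(site, title) {
  var hash = crypto.createHash('sha256')
    .update(site + '\t' + title, 'utf8')
    .digest('base64')
  return '&' + hash + '.sha256'
}

// e.g. pageIdFor('https://en.wikipedia.org/', 'Secure_Scuttlebutt')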
bin.js

@@ -33,9 +33,9 @@
   }
   return JSON.stringify(draftMsg, null, 2).length
 }
 
-function getJson(url, cb) {
+function get(url, cb) {
   var opts = URL.parse(url)
   opts.headers = {
     'User-Agent': userAgentBase
       + (userAgentContact ? ' (' + userAgentContact + ')' : '')
@@ -52,22 +52,32 @@
       bufs.push(buf)
     })
     res.on('end', function () {
       res.removeListener('error', cb)
-      var buf = Buffer.concat(bufs)
-      bufs = null
-      var data
+      var str
       try {
-        data = JSON.parse(buf.toString('utf8'))
+        str = Buffer.concat(bufs).toString('utf8')
       } catch(e) {
         return cb(e)
       }
-      cb(null, data)
+      cb(null, str)
     })
     res.on('error', cb)
   })
 }
 
+function getJson(url, cb) {
+  get(url, function (err, str) {
+    var data
+    try {
+      data = JSON.parse(str)
+    } catch(e) {
+      return cb(e)
+    }
+    cb(null, data)
+  })
+}
+
 function publishDrafts(sbot, drafts, cb) {
   var draftIdIndex = {}
   drafts.forEach(function (draft, i) {
     draftIdIndex[draft.draftId] = i
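The refactor above splits the old getJson into a raw get (which returns the response body as a string) plus a thin JSON-parsing wrapper, so the Special:Version scraping introduced below can fetch HTML through the same code path. One caveat: as committed, getJson does not check err before parsing, so a failed get surfaces as a JSON.parse error rather than the underlying network error. A usage sketch (the URLs are placeholders, not from this commit):

// Fetch raw HTML:
get('https://example.org/wiki/Special:Version', function (err, html) {
  if (err) throw err
  // ... scan html for the api.php link ...
})

// Fetch and parse a JSON API response:
getJson('https://example.org/w/api.php?format=json&action=query', function (err, data) {
  if (err) throw err
  // ... use data.query ...
})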
@@ -124,8 +134,10 @@
   process.stdout.write(fs.readFileSync(path.join(__dirname, 'usage.txt')))
   process.exit(0)
 }
 
+var apiHrefRe = /(?:>api\.php<|<link rel="EditURI").* href="([^"]+\/api\.php)(?:[?#][^"]*)?".*/
+
 ssbClient(function (err, sbot, config) {
   if (err) throw err
   var conf = config.wikimedia || {}
   userAgentContact = conf.contact
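apiHrefRe matches either a literal >api.php< link or a MediaWiki EditURI <link> tag, capturing the href that ends in /api.php and dropping any query string or fragment. An illustrative check (the HTML fragment is a typical EditURI tag, not taken from this commit):

var URL = require('url')
var apiHrefRe = /(?:>api\.php<|<link rel="EditURI").* href="([^"]+\/api\.php)(?:[?#][^"]*)?".*/

// A representative EditURI tag as emitted by MediaWiki (illustrative):
var html = '<link rel="EditURI" type="application/rsd+xml" href="//example.org/w/api.php?action=rsd"/>'
var m = apiHrefRe.exec(html)
if (m) console.log(URL.resolve('https://example.org/wiki/Special:Version', m[1]))
// Prints: https://example.org/w/api.php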
@@ -140,52 +152,90 @@
       return sbot.close()
     }
   }
 
+  var siteWikiBases = {}
   var pagesInfo = urls.map(function (page) {
-    var m = /^(https?:\/\/.*?)(\/wiki)?\/(.*)$/.exec(page)
+    // Note: this assumes the wiki is either at / or /wiki/
+    var m = /^(https?:\/\/.*?\/)(wiki\/)?(.*)$/.exec(page)
     if (!m) throw 'Unable to parse page URL ' + page
     return {
-      site: m[1] + '/',
-      api: m[1] + (m[2] ? '/w' : '/wiki') + '/api.php',
+      url: page,
+      site: m[1],
+      wikiBase: m[1] + (m[2] || ''),
       title: m[3]
     }
   })
-  var pagesInfoByApi = {}
+  // Group page info by site (wiki base url)
+  var pagesInfoBySite = {}
   pagesInfo.forEach(function (pageInfo) {
-    var infos = pagesInfoByApi[pageInfo.api] || (pagesInfoByApi[pageInfo.api] = [])
+    var infos = pagesInfoBySite[pageInfo.site]
+      || (pagesInfoBySite[pageInfo.site] = [])
     infos.push(pageInfo)
+    siteWikiBases[pageInfo.site] = pageInfo.wikiBase
   })
-  console.log('Normalizing titles...')
-  var waiting = 0
-  for (var api in pagesInfoByApi) (function (api) {
-    var pagesInfoForApi = pagesInfoByApi[api]
-    var pagesInfoForApiByTitle = {}
-    var titles = pagesInfoForApi.map(function (info) {
-      pagesInfoForApiByTitle[info.title] = info
-      return info.title
-    })
-    var url = api + '?format=json&action=query' +
-      '&titles=' + encodeURIComponent('\x1f' + titles.join('\x1f')) +
-      '&' // trailing & needed for some reason
-    waiting++
-    getJson(url, function (err, data) {
-      if (err) throw err
-      if (data.warnings) console.trace('Warnings:', data.warnings)
-      if (data.query.normalized) data.query.normalized.forEach(function (norm) {
-        var info = pagesInfoForApiByTitle[norm.from]
-        if (!info) {
-          console.error(JSON.stringify({titles: titles, response: data}, 0, 2))
-          throw new Error('Unexpected title in server response')
-        }
-        // console.log('Normalized title', norm.from, norm.to)
-        info.title = norm.to
+
+  var apiBySite = {}
+  findApis()
+
+  function findApis() {
+    console.log('Finding Wikimedia APIs...')
+    // Some possible API locations:
+    //   /mediawiki/api.php
+    //   /wiki/api.php
+    //   /w/api.php
+    //   /api.php
+    // TODO: do this with less HTTP requests, or cache the results across runs
+    var waiting = 0
+    for (var site in pagesInfoBySite) (function (site) {
+      waiting++
+      var base = siteWikiBases[site]
+      var url = base + 'Special:Version'
+      get(url, function (err, html) {
+        if (err) throw err
+        var m = apiHrefRe.exec(html)
+        if (!m) throw new Error('Unable to find api.php for ' + site)
+        var api = URL.resolve(url, m[1])
+        apiBySite[site] = api
+        if (!--waiting) normalizeTitles()
       })
-      if (!--waiting) next()
-    })
-  }(api))
+    }(site))
+  }
 
-  function next() {
+  function normalizeTitles() {
+    console.log('Normalizing titles...')
+    var waiting = 0
+    for (var site in pagesInfoBySite) (function (site) {
+      var pagesInfoForSite = pagesInfoBySite[site]
+      var pagesInfoForSiteByTitle = {}
+      var titles = pagesInfoForSite.map(function (info) {
+        pagesInfoForSiteByTitle[info.title] = info
+        return info.title
+      })
+      var api = apiBySite[site]
+      var url = api
+        + '?format=json&action=query'
+        + '&titles=' + encodeURIComponent('\x1f' + titles.join('\x1f'))
+        + '&' // trailing & needed for some reason
+      waiting++
+      getJson(url, function (err, data) {
+        if (err) throw err
+        if (data.warnings) console.trace('Warnings:', data.warnings)
+        if (data.query.normalized) data.query.normalized.forEach(function (norm) {
+          var info = pagesInfoForSiteByTitle[norm.from]
+          if (!info) {
+            console.error(JSON.stringify({titles: titles, response: data}, 0, 2))
+            throw new Error('Unexpected title in server response')
+          }
+          // console.log('Normalized title', norm.from, norm.to)
+          info.title = norm.to
+        })
+        if (!--waiting) getRevisions()
+      })
+    }(site))
+  }
+
+  function getRevisions() {
     console.log('Getting revisions...')
     var userHashes = {}
     pull(
       pull.values(pagesInfo),
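The '\x1f' joining in normalizeTitles (carried over from the old code) uses the MediaWiki API's alternative multi-value separator: when a parameter value is prefixed with U+001F, the API splits it on U+001F instead of '|', so titles that themselves contain '|' pass through safely. For illustration (the titles here are made up):

// Illustrative only: how the titles parameter is encoded.
var titles = ['Foo|Bar', 'Baz']
var param = encodeURIComponent('\x1f' + titles.join('\x1f'))
// param === '%1FFoo%7CBar%1FBaz'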
@@ -248,9 +298,10 @@
       if (rvdone) return cb(true)
 
       console.log('Getting revisions for', pageInfo.title + '...',
        rvstart || '', rvcontinue || '')
-      var url = api + '?format=json&action=query&prop=revisions&rvslots=*'
+      var url = apiBySite[pageInfo.site]
+        + '?format=json&action=query&prop=revisions&rvslots=*'
        + '&titles=' + encodeURIComponent(pageInfo.title)
        + '&rvprop=ids|timestamp|comment|user|sha1|size|slotsha1|slotsize|content|roles|flags|tags'
        + '&rvdir=newer'
        + (rvcontinue ? '&rvcontinue=' + rvcontinue : '')
notes.txt

@@ -36,4 +36,6 @@
 https://wiki.p2pfoundation.net/Scuttlebutt
 https://sudoroom.org/wiki/Secure_Scuttlebutt
 https://www.noisebridge.net/wiki/Pub
 
+https://stackoverflow.com/questions/35323728/get-wikipedia-page-url-by-pageid
+https://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
