Модуль:External data/new
multitask
task
Синтаксис:
{{#invoke:External data/new|task
| url = https://example.org
| task = тексты автора
| context = издание
| (format string for data row)
| header = (format string for header)
| footer = (format string for footer)
| default = (format string to replace empty result set)
| header template = (template for header)
| footer template = (template for footer)
| default template = (template to replace empty result set)
}}
Примеры
continued
Синтаксис:
{{#invoke:External data/new|continued
| url = https://example.org
| __next = field code (XPath, CSS, etc.)
| (format string for data row)
| field1 = (field code)
...
| fieldn = (field code)
| header = (format string for header)
| footer = (format string for footer)
| default = (format string to replace empty result set)
| header template = (template for header)
| footer template = (template for footer)
| default template = (template to replace empty result set)
}}
Примеры
Wikicode | Result |
---|---|
{{#invoke:External data/new|continued
| url = https://www.apn.ru/index.php?do=authors&author=637
| __encoding = windows-1251
| __next = div#navi>em.bbcodes>a.attr(href)
| {{!}}-
{{!}} [{{{div.left_cell>h2>a.attr(href)}}} {{{div.left_cell>h2>a}}}] {{!}}{{!}} {{{div.left_cell>ul>li:nth-of-type(2)}}}
| __time = __time
| header = {{{!}} class="wikitable"
! Заглавие !! Дата
| footer template = External Data/footer
}} |
Источник: https://www.apn.ru/index.php?cstart=2&do=authors&author=637. 29 ноября 2024 |
{{#invoke:External data/new|continued
| url = http://www.globalrus.ru/author/56614/
| __format = HTML with XPath
| __next =
| {{!}}-
{{!}} [{{{//span[@class="author"]/a[@class="anncHeader"]/@href}}} {{{//span[@class="author"]/a[@class="anncHeader"]}}}] {{!}}{{!}} {{{//span[@class="author"]/a[@class="anncHeader"]/preceding-sibling::text()[1]}}}
| __time = __time
| header = {{{!}} class="wikitable"
! Заглавие !! Дата
| footer template = External Data/footer
}} |
Источник: http://www.globalrus.ru/author/56614/. 29 ноября 2024 |
{{#invoke:External data/new|continued
| url = https://politconservatism.ru/author/krylov
| __format = HTML with XPath
| __next = /html/body/main/div/nav/div/a[@class="next page-numbers"]/@href
| {{!}}-
{{!}} [{{{//h2/a/@href}}} {{{//h2/a}}}] {{!}}{{!}} {{{//li[@class="post-date meta-wrapper"]/span/a}}}
| __time = __time
| header = {{{!}} class="wikitable"
! Заглавие !! Дата
| footer template = External Data/footer
}} |
Источник: https://politconservatism.ru/author/krylov. 29 ноября 2024 |
{{#invoke:External data/new|continued
| url = https://web.archive.org/web/20221130095513/http://rulife.ru/old/name/author/39/
| __format = HTML with XPath
| __next = /html/body/main/div/nav/div/a[@class="next page-numbers"]/@href
| {{!}}-
{{!}} [{{{//a[@class="anons2"]/@href}}} {{{//a[@class="anons2"]}}}] {{!}}{{!}}
| __time = __time
| header = {{{!}} class="wikitable"
! Заглавие !! Дата
| footer template = External Data/footer
}} |
Источник: https://web.archive.org/web/20221130095513/http://rulife.ru/old/name/author/39/. 29 ноября 2024 |
{{#invoke:External data/new|continued
| url = https://web.archive.org/web/20221210015711/http://www.rus-obr.ru/users/konstantin-krylov
| __format = text
| __regex = %<a\s+href\s*=\s*"/web/\d{14}/(?<href>[^"]+)"[^>]*>(?<date>\d\d\.\d\d\.\d\d\d\d)\s+-\s+(?<title>[^<]+)</a>%
| {{!}}-
{{!}} [{{{href}}} {{{title}}}] {{!}}{{!}} {{{date}}}
| __time = __time
| header = {{{!}} class="wikitable"
! Заглавие !! Дата
| footer template = External Data/footer
}} |
Источник: https://web.archive.org/web/20221210015711/http://www.rus-obr.ru/users/konstantin-krylov. 29 ноября 2024 |
local get = mw.ext.externalData.getExternalData
local clone = mw.clone
local decode, new_url, validate = mw.text.decode, mw.uri.new, mw.uri.validate
local trim = mw.text.trim
local ustring = mw.ustring
local sub, gmatch, gsub, gsplit = ustring.sub, ustring.gmatch, ustring.gsub, mw.text.gsplit
local nbsp = ustring.char (160)
local concat = table.concat
local dump = mw.dumpObject
local set_property, subobject, ask, query = mw.smw.set, mw.smw.subobject, mw.smw.ask, mw.smw.getQueryResult
--[[
Service functions
--]]
-- Merges any number of values into one new table.
-- Table arguments contribute their entries; any other non-nil argument is treated as a one-element list.
-- Entries of the sequence part (1..n) are appended in order; other numeric keys are appended after them;
-- non-numeric keys are copied as is, later arguments overwriting earlier ones.
-- nil arguments are skipped instead of silently ending the merge.
-- @param ...  tables or scalar values
-- @return     a new table; the arguments are not modified
local function merge (...)
	local merged = {}
	local count = 0
	for index = 1, select ('#', ...) do
		local arg = select (index, ...)
		if arg ~= nil then
			if type (arg) ~= 'table' then
				arg = { arg }
			end
			-- The sequence part first, so that its order does not depend on pairs ():
			local length = 0
			for position, value in ipairs (arg) do
				length = position
				count = count + 1
				merged [count] = value
			end
			for key, value in pairs (arg) do
				if type (key) ~= 'number' then
					merged [key] = value
				elseif key < 1 or key > length or key % 1 ~= 0 then
					-- A numeric key outside the sequence part: still appended.
					count = count + 1
					merged [count] = value
				end
			end
		end
	end
	return merged
end
-- In {unpack {1, 2}, 3} only the first unpacked value survives, because unpack is not the last expression.
-- The argument lists are therefore merged into one table first and unpacked in a single call.
local function unpack_all (...)
	local flattened = merge (...)
	return unpack (flattened)
end
-- Builds a membership set from the values of a table.
-- The parameter and the accumulator are named so that they shadow neither the global table library
-- nor this function's own name.
-- @param list  table whose values become the members (its keys are ignored)
-- @return      a new table mapping every value of list to true
local function set (list)
	local members = {}
	for _, value in pairs (list) do
		members [value] = true
	end
	return members
end
-- Tables used as keys are compared by identity, so a parameter list is turned into a string
-- (its items joined with newlines) that can serve as a lookup key.
local function hash_params (params)
	local key = concat (params, '\n')
	return key
end
-- Completes url with the components it lacks, taken from base, and returns the result as a string.
-- url is passed through mw.text.decode before parsing with mw.uri.new; base is parsed as is.
-- Only protocol, authority and hostPort are filled in; nothing else is copied from base here.
-- NOTE(review): pairs () visits the three keys in unspecified order; presumably the order is irrelevant to mw.uri — confirm.
local function absolute_url (url, base)
local parsed_url = new_url (decode (url))
local parsed_base = new_url (base)
for fragment, default in pairs {
protocol = 'https',
authority ='??',
hostPort = ''
} do
-- Keep what url already has; otherwise fall back to base, then to the hard-coded default:
parsed_url [fragment] = parsed_url [fragment] or parsed_base [fragment] or default
end
return tostring (parsed_url)
end
-- Default reducer for multi_page: merges the newly obtained data and errors into the accumulated ones.
-- Returns the merged data and the merged errors, in that order.
local function reduce_data_and_errors (data, errors, new_data, new_errors)
	local all_data = merge (data, new_data)
	local all_errors = merge (errors, new_errors)
	return all_data, all_errors
end
-- Creates a continuation function for multi_page.
-- fields: list of keys of the data table that hold continuation URLs.
-- The returned function (url, data, errors) yields nil when there are errors; otherwise it yields
-- a list of one-element parameter lists, each holding a URL made absolute against url.
local function url_extractor (fields)
	return function (url, data, errors)
		if errors and #errors > 0 then
			return nil
		end
		local continuations = {}
		for _, field in ipairs (fields) do
			-- A field holds either one value or a list of values:
			local found = data [field]
			if type (found) ~= 'table' then
				found = { found }
			end
			for _, link in ipairs (found) do
				continuations [#continuations + 1] = { absolute_url (decode (link), url) }
			end
		end
		return continuations
	end
end
-- Returns a copy of fields extended with every {{{macro}}} found in the format strings.
-- A macro that is not yet a field is added as a field that maps to itself.
-- fields may be nil; formats is a table of format strings.
local function add_implicit_fields (fields, formats)
	local combined = clone (fields) or {}
	for _, format_string in pairs (formats) do
		for name in format_string:gmatch '{{{(.-)}}}' do
			combined [name] = combined [name] or name
		end
	end
	return combined
end
-- Replaces every {{{name}}} in format with row [name].
-- Only string and number values are substituted; a macro whose value is missing or is not a scalar
-- (for example a list of values) is left untouched instead of raising an "invalid replacement value" error.
-- @param format  format string with {{{name}}} macros
-- @param row     table of values keyed by macro name
-- @return        the expanded string (exactly one value; gsub's replacement count is dropped)
local function expand (format, row)
	local function lookup (name)
		local value = row [name]
		local kind = type (value)
		if kind == 'string' or kind == 'number' then
			return value
		end
		-- Returning nil makes gsub keep the original match.
		return nil
	end
	return (gsub (format, '{{{(.-)}}}', lookup))
end
-- Returns a new table holding only those entries of tbl whose values are strings or numbers.
local function only_scalar (tbl)
	local kept = {}
	for name, item in pairs (tbl) do
		local kind = type (item)
		if kind == 'string' or kind == 'number' then
			kept [name] = item
		end
	end
	return kept
end
-- Renders row, preferring a template over a format string over a default.
-- template: if set, it is expanded through frame with the scalar values of row as arguments.
-- format_string: otherwise, if set, its {{{macros}}} are expanded from row.
-- default: returned when neither of the above gives a result.
local function expand_or_template (row, template, format_string, frame, default)
	local rendered
	if template then
		rendered = frame:expandTemplate { title = template, args = only_scalar (row) }
	end
	if not rendered and format_string then
		rendered = expand (format_string, row)
	end
	if not rendered then
		rendered = default
	end
	return rendered
end
--[[
Main functions
--]]
-- Wraps func into a function that tries a list of argument lists until one succeeds.
-- func: the function to call with each unpacked variant.
-- validator: optional; called with func's results, a true result accepts them.
--            Results are also accepted when their first value is true-ish.
-- failure: optional value returned when no variant succeeds (false by default).
local function fallbacks (func, validator, failure)
	local function accepted (results)
		if validator and validator (unpack (results)) then
			return true
		end
		return results [1]
	end
	return function (variants)
		for _, arguments in ipairs (variants) do
			local results = { func (unpack (arguments)) }
			if accepted (results) then
				return unpack (results)
			end
		end
		return failure or false
	end
end
-- Upper bound on the number of func calls made during one run of a multi_page wrapper:
local pages_limit = 50
-- Wraps func into a function that calls it repeatedly, following continuations.
-- func: called with an unpacked parameter list; all its return values are collected.
-- continuations: called with the first parameter and func's return values;
--                returns a list of further parameter lists, or a false value when there are none.
-- reduce: optional; combines the accumulated results with the new ones (reduce_data_and_errors by default).
-- The wrapper returns the accumulated results, unpacked.
local function multi_page (func, continuations, reduce)
return function (...)
-- queue: parameter lists waiting to be processed; processed: hashes of the lists already handled;
-- reduced: accumulated results, starting as two empty tables.
local queue, processed, reduced = { {...} }, {}, { {}, {} }
local pages = 0
while #queue > 0 and pages < pages_limit do
local params = table.remove (queue, 1)
local hashed = hash_params (params)
-- Each distinct parameter list is processed only once:
if not processed [hashed] then
local returns = { func (unpack (params)) }
processed [hashed] = true
pages = pages + 1
-- Unfortunately, re-packing unpacked tables is buggy:
-- reduced = { reduce (unpack (reduced), unpack (returns)) }
reduced = { (reduce or reduce_data_and_errors) (unpack_all (reduced, returns)) }
-- unpack (params) is not the last argument here, so only params [1] is passed:
local new_params = continuations (unpack (params), unpack (returns))
if new_params then
for _, param_set in ipairs (new_params) do
queue [#queue + 1] = param_set
end
end
end
end
return unpack (reduced)
end
end
-- Builds a multi-page scraping function for the given connection settings.
-- connection: settings table handed to getExternalData; its data and url keys are overwritten here.
-- fields: field mapping, copied into connection.data.
-- further: optional list of selectors of continuation URLs; each is also requested as a field under its own name.
-- constants: values merged into the fetched table and into each of its rows, together with __url.
-- Returns a function (urls) yielding data, errors; urls is a whitespace-separated list of URLs.
local function scraper (connection, fields, further, constants)
	local continuation_selectors = further or {}
	connection.data = clone (fields)
	for _, selector in ipairs (continuation_selectors) do
		connection.data [selector] = selector
	end
	local function fetch (urls)
		local data, errors = {}, {}
		for url in gsplit (urls, '%s+') do
			connection.url = url
			local page_data, page_errors = get (connection)
			if not page_errors then
				-- Stamp the page table and every row with the constants and the source URL:
				page_data = merge (constants, page_data, { __url = url })
				for index, row in ipairs (page_data) do
					page_data [index] = merge (constants, row, { __url = url })
				end
			end
			data = merge (data, page_data)
			errors = merge (errors, page_errors)
		end
		return data, errors
	end
	return multi_page (fetch, url_extractor (continuation_selectors))
end
-- Default connection settings. The exported functions pass this table to connection_params,
-- which copies it and overrides a key when the invocation has an argument with the same name prefixed with "__".
-- NOTE(review): presumably these are parameters understood by mw.ext.externalData.getExternalData — confirm against the External Data documentation.
local default_connection = {
format = 'HTML',
encoding = 'UTF-8',
cache = 86400,
stale = true,
regex = ''
}
-- Argument names that is_reserved reports as reserved, so that extraction_params does not turn them into fields:
local reserved = set {
1,
'template', 'header template', 'body template', 'footer template',
'header', 'footer', 'default',
'__format', '__next', '__prev', '__page', 'url'
}
-- Tells whether an argument name is reserved, i.e. must not be treated as a field or context name.
-- Reserved are: the names listed in reserved, every non-string key (so numeric keys other than 1 no longer
-- raise an error on key:sub), names starting with "*", and "__"-prefixed names of connection settings.
-- The "__" prefix is now required for the last case, in line with connection_params.
-- @param key  argument name (string or number)
-- @return     true if the name is reserved, false otherwise
local function is_reserved (key)
	if reserved [key] then
		return true
	end
	if type (key) ~= 'string' then
		return true
	end
	if key:sub (1, 1) == '*' then
		return true
	end
	return key:sub (1, 2) == '__' and default_connection [key:sub (3)] ~= nil
end
-- Sorts invocation arguments into format strings, template names and pass-through parameters.
-- formats: body (from the first positional argument), header, footer, default.
-- templates: body (from "template") and any "<name> template", keyed by <name>.
-- params: arguments whose names start with "*", keyed by the name without the asterisk.
-- Returns formats, templates, params.
local function formatting_params (args)
	local formats, templates, params = {}, {}, {}
	local format_names = { header = true, footer = true, default = true }
	for name, value in pairs (args) do
		if name == 1 then
			formats.body = value
		elseif type (name) == 'string' then
			if name == 'template' then
				templates.body = value
			elseif format_names [name] then
				formats [name] = value
			elseif name:sub (-9) == ' template' then
				templates [name:sub (1, -10)] = value
			elseif name:sub (1, 1) == '*' then
				params [name:sub (2)] = value
			end
		end
	end
	return formats, templates, params
end
-- Returns a copy of default in which settings are overridden by the "__"-prefixed arguments.
-- Only "__name" arguments whose name is a key of default are taken into account.
local function connection_params (args, default)
	local settings = clone (default)
	for name, value in pairs (args) do
		if type (name) == 'string' and name:sub (1, 2) == '__' then
			local option = name:sub (3)
			if default [option] ~= nil then
				settings [option] = value
			end
		end
	end
	return settings
end
-- Extracts the field mapping and the continuation selectors from the invocation arguments.
-- further: the non-empty values of __next, __prev and __page, always in that order
--          (previously the order depended on the unspecified order of pairs ()).
-- fields: every other non-empty argument that is not reserved, keyed by its name.
-- The unused local url has been removed.
-- @param args  invocation arguments
-- @return      fields, further
local function extraction_params (args)
	local fields, further = {}, {}
	local continuation_names = { '__next', '__prev', '__page' }
	local is_continuation = {}
	for _, name in ipairs (continuation_names) do
		is_continuation [name] = true
		local selector = args [name]
		if selector ~= nil and selector ~= '' then
			further [#further + 1] = selector
		end
	end
	for arg, value in pairs (args) do
		if not is_continuation [arg] and value ~= '' and not is_reserved (arg) then
			fields [arg] = value
		end
	end
	return fields, further
end
-- Value of "Semantics" that marks a selector as a continuation selector:
local __next_label = 'следующие тексты'
-- Strips a fixed-length wrapper from a stored value: drops the first 20 and the last 6 characters,
-- decodes HTML entities and replaces non-breaking spaces with ordinary ones. Returns nil for a false-ish input.
-- NOTE(review): the offsets presumably match a wrapper like <pre class="smwpre">…</pre> — confirm.
local function strip_pre (pre)
	if not pre then
		return nil
	end
	local inner = sub (pre, 21, -7)
	local cleaned = gsub (decode (inner), nbsp, ' ')
	return cleaned
end
-- Looks up the selectors stored for a scraping task in a context, using a Semantic MediaWiki query (mw.smw.ask).
-- task, context: inserted into the [[Scraping task::…]] and [[Context::…]] conditions;
--                both are concatenated into the query string, so neither may be nil.
-- Returns nil when the query gives no results; otherwise returns fields, further, encoding, format, regex.
local function extraction_params4task (task, context)
local results = ask {
'[[Scraping task::' .. task .. ']] [[Context::' .. context .. ']]',
'?Semantics=semantics',
'?Selector type=format',
'?Parsing regex#-=regex',
'?Selector#-=selector',
'?Context.Text encoding#-=encoding',
links = 'none', mainlabel = '-'
}
-- Nothing is stored for this task and context:
if #( results or {} ) == 0 then
return nil
end
-- format, regex and encoding are taken from the first result only:
local format, regex = results[1].format, strip_pre (results[1].regex)
local encoding = results[1].encoding or ''
local fields, further = {}, {}
for _, line in ipairs (results) do
if line.semantics then
-- Selectors labelled with __next_label are continuation selectors; the others are fields keyed by their semantics:
if line.semantics == __next_label then
further [#further + 1] = strip_pre (line.selector)
else
fields [line.semantics] = strip_pre (line.selector)
end
end
end
return fields, further, encoding, format, regex
end
-- Creates a function (rows, errors) that renders scraped data as wikitext.
-- With at least one row and no errors: header, one line per row, footer.
-- Otherwise: the default template or format, or the concatenated errors in an error span.
-- params are merged into whatever is handed to a template or a format string.
-- For debugging, a dump of formats, templates and params can be prepended to the output lines.
local function result_formatter (formats, templates, params, frame)
	return function (rows, errors)
		local output = {}
		local function emit (text)
			-- A nil text adds nothing.
			output [#output + 1] = text
		end
		local succeeded = #rows > 0 and #errors == 0
		if succeeded then
			emit (expand_or_template (merge (rows, params), templates.header, formats.header, frame))
			for _, row in ipairs (rows) do
				-- A row with neither template nor format is shown as a dump:
				emit (expand_or_template (merge (row, params), templates.body, formats.body, frame, dump (row)))
			end
			emit (expand_or_template (merge (rows, params), templates.footer, formats.footer, frame))
		else
			local subject = merge (rows, params)
			local fallback = formats.default or '<span class="error">' .. concat (errors) .. '</span>'
			emit (expand_or_template (subject, templates.default, fallback, frame))
		end
		return concat (output, '\n')
	end
end
-- Scrapes url and renders the result.
-- The formatting arguments are read from frame.args; macros used in the format strings are added to fields.
-- connection, fields, further and constants are handed to scraper.
local function extract (url, connection, fields, further, constants, frame)
	local formats, templates, params = formatting_params (frame.args)
	local all_fields = add_implicit_fields (fields, formats)
	local render = result_formatter (formats, templates, params, frame)
	local scrape = scraper (connection, all_fields, further, constants)
	return render (scrape (url))
end
--[[
Exported functions
--]]
return {
-- Scrapes args.url, following the continuation selectors, with the fields given directly in the invocation.
continued = function (frame)
	local args = frame.args
	local connection = connection_params (args, default_connection)
	local fields, further = extraction_params (args)
	-- For debugging, set to: '\n<pre><nowiki>' .. dump { fields = fields, further = further, connection = connection } .. '</nowiki></pre>\n'
	local debug_output = ''
	return debug_output .. extract (args.url, connection, fields, further, {}, frame)
end,
-- Scrapes args.url with the selectors stored for args.task in args.context.
-- Encoding, format and regex found in the stored description override the connection settings.
task = function (frame)
	local args = frame.args
	local task, context = args.task, args.context
	local connection = connection_params (args, default_connection)
	local fields, further, encoding, format, regex = extraction_params4task (task, context)
	if encoding then
		connection.encoding = encoding
	end
	if format then
		connection.format = format
	end
	if regex then
		connection.regex = regex
	end
	-- For debugging, set to: '\n<pre><nowiki>' .. dump { fields = fields, further = further, connection = connection } .. '</nowiki></pre>\n'
	local debug_output = ''
	local constants = { __task = task, __context = context }
	return debug_output .. extract (args.url, connection, fields, further, constants, frame)
end,
-- Runs one scraping task in several contexts.
-- Every non-reserved string argument whose value starts with "http" is read as context = url.
-- Each context gets its own copy of the connection settings, so encoding, format and regex
-- found for one context no longer leak into the contexts processed after it.
-- Non-string keys and values are skipped before is_reserved and url:sub are applied to them.
-- Returns the rendered parts joined with newlines.
multitask = function (frame)
	local task = frame.args.task
	local base_connection = connection_params (frame.args, default_connection)
	local parts = {}
	for context, url in pairs (frame.args) do
		if type (context) == 'string' and type (url) == 'string'
			and not is_reserved (context) and url:sub (1, 4) == 'http' then
			local connection = clone (base_connection)
			local fields, further, encoding, format, regex = extraction_params4task (task, context)
			connection.encoding = encoding or connection.encoding
			connection.format = format or connection.format
			connection.regex = regex or connection.regex
			local constants = { __task = task, __context = context }
			parts [#parts + 1] = extract (url, connection, fields, further, constants, frame)
		end
	end
	return concat (parts, '\n')
end,
-- Stores the description of a scraping task as Semantic MediaWiki subobjects and returns a wikitext report.
-- Arguments are taken from the frame and, if there is one, from the parent frame (the frame's own arguments win).
-- __context, __task, __format, __regex and __encoding describe the task; __context is concatenated into
-- the report and therefore must be present. Every other named argument is stored as Semantics = Selector.
-- Positional (numeric) arguments are skipped: previously arg:sub raised an error on them.
describe = function (frame)
	local args = clone (frame.args)
	if frame:getParent() then
		args = merge (frame:getParent().args, args)
	end
	local context, task, format, regex = args.__context, args.__task, args.__format, args.__regex
	local report = { '* [[' .. context .. ']]' }
	if args.__encoding and args.__encoding ~= '' then
		set_property { ['Text encoding'] = args.__encoding }
		report [#report + 1] = '** Кодировка: ' .. args.__encoding
	end
	if args.__regex and args.__regex ~= '' then
		report [#report + 1] = '** Регулярное выражение: ' .. args.__regex
	end
	-- The __time pseudo-field is always recorded:
	subobject {
		Context = context,
		['Scraping task'] = task,
		['Selector type'] = format,
		['Parsing regex'] = regex,
		Selector = '__time',
		Semantics = '__time'
	}
	for arg, value in pairs (args) do
		if type (arg) == 'string' and arg:sub (1, 2) ~= '__' then
			subobject {
				Context = context,
				['Scraping task'] = task,
				['Selector type'] = format,
				['Parsing regex'] = regex,
				Selector = value,
				Semantics = arg
			}
			report [#report + 1] = '** ' .. arg .. ' = <code><nowiki>' .. value .. '</nowiki></code>'
		end
	end
	return frame:preprocess (concat (report, '\n'))
end,
-- Removes every "|" from the str argument and trims the whitespace around the result.
-- gsub returns the string and a replacement count; only the string is kept, so the count
-- is no longer handed to trim as its second argument (the set of characters to trim).
-- A missing str is treated as an empty string.
trim_pipes = function (frame)
	local without_pipes = (frame.args.str or ''):gsub ('|', '')
	return trim (without_pipes)
end,
-- Exposes absolute_url to wikitext: completes the url argument with components of the base argument.
full_url = function (frame)
	local args = frame.args
	return absolute_url (args.url, args.base)
end
}
--[[
To test the task function from the console:
=mw.dumpObject (p.task (mw.getCurrentFrame():newChild { title = 'Test', args = {context = 'АПН', task = 'тексты автора', url = 'https://www.apn.ru/index.php?do=authors&author=637', template = 'Статья-строка таблицы', ['*автор'] = 'Александр Машин', ['*издание'] = 'АПН', ['*традиция'] = 'да', ['*база'] = 'https://www.apn.ru/index.php?do=authors&author=637', ['footer template'] = 'External Data/footer', default = 'не найдено'}}))
--]]