-
Notifications
You must be signed in to change notification settings - Fork 10
/
luaxml-domobject.lua
792 lines (715 loc) · 23.5 KB
/
luaxml-domobject.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
--- DOM module for LuaXML
-- @module luaxml-domobject
-- @author Michal Hoftich <michal.h21@gmail.com>
local dom = {}
local xml
local handler
local css_query
local html
if kpse then
xml = require("luaxml-mod-xml")
handler = require("luaxml-mod-handler")
css_query = require("luaxml-cssquery")
html = require("luaxml-mod-html")
else
xml = require("luaxml.mod-xml")
handler = require("luaxml.mod-handler")
css_query = require("luaxml.cssquery")
html = require("luaxml.mod-html")
end
local HtmlParser = html.HtmlParser
-- Names of void elements. An element listed here that has no children is
-- serialized in the self-closing form; the table is also the default value of
-- the `voidElements` parser option.
local void = {area = true, base = true, br = true, col = true, hr = true, img = true, input = true, link = true, meta = true, param = true}
-- support also upper case names
do
  -- The upper case names are collected first and merged afterwards: assigning
  -- to a non-existent field of a table while it is being traversed with
  -- pairs() has undefined behavior.
  local upper_names = {}
  for name in pairs(void) do
    upper_names[#upper_names + 1] = string.upper(name)
  end
  for _, name in ipairs(upper_names) do
    void[name] = true
  end
end
-- Characters that must not appear literally in serialized text or attribute
-- values, mapped to the entities that replace them.
local escapes = {
  [">"] = "&gt;",
  ["<"] = "&lt;",
  ["&"] = "&amp;",
  ['"'] = "&quot;",
  ["'"] = "&#39;",
  ["`"] = "&#x60;"
}
-- declarations of local functions
local html_to_dom
local html_parse
local parse
--- Replace every character captured by `search` with its entity.
-- @param search Lua pattern with one capture that selects the characters to escape
-- @param text value to be escaped; it is converted with tostring() first, so
-- numeric or boolean attribute values do not raise an error
-- @return string the escaped text
local function escape(search, text)
  -- the outer parentheses drop the substitution count returned by gsub
  return (tostring(text):gsub(search, function(ch)
    return escapes[ch] or ""
  end))
end
--- Escape text for use as element content (`<`, `>` and `&`).
-- @param text text to be escaped
-- @return string the escaped text
local function escape_element(text)
  return escape("([<>&])", text)
end
--- Escape text for use as an attribute value (`<`, `>`, `&`, both quote
-- characters and the backtick).
-- @param text attribute value to be escaped
-- @return string the escaped text
local function escape_attr(text)
  return escape("([<>&\"'`])", text)
end
-- Format strings used by serialize_dom(), indexed by node type.
-- Each entry may define these keys (a missing key is treated as an empty format):
--   start - formatted with the node name and the serialized attributes
--   text  - formatted with the text content of the node
--   stop  - formatted with the node name
--   void  - used instead of start/stop for void elements without children,
--           formatted with the node name and the serialized attributes
local actions = {
TEXT = {text = "%s"},
COMMENT = {start = "<!-- ", text = "%s", stop = " -->"},
ELEMENT = {start = "<%s%s>", stop = "</%s>", void = "<%s%s />"},
DECL = {start = "<?%s %s?>"},
PI = {start = "<?%s %s?>"},
DTD = {start = "<!DOCTYPE ", text = "%s" , stop=">"},
CDATA = {start = "<![CDATA[", text = "%s", stop ="]]>"}
}
--- Serialize a DOM node, including all of its descendants, back to XML.
-- This function is mainly used for internal purposes, it is better to
-- use the `DOM_Object:serialize()`.
-- @param parser DOM object; its `_handler.root` is serialized when `current` is not given
-- @param current Element which should be serialized
-- @param level nesting depth; it is only increased by one and passed on to the recursive calls
-- @param output table that collects the serialized pieces; a new table is created when omitted
-- @return table Table with XML strings. It can be concatenated using table.concat() function to get XML string corresponding to the DOM_Object.
local function serialize_dom(parser, current, level, output)
  local output = output or {}
  local level = level or 0
  -- format string for the given node type and serialization phase ("" when undefined)
  local function get_action(typ, action)
    local ac = actions[typ] or {}
    return ac[action] or ""
  end
  local function insert(format, ...)
    table.insert(output, string.format(format, ...))
  end
  local function prepare_attributes(attr)
    local t = {}
    for k, v in pairs(attr or {}) do
      t[#t+1] = string.format("%s='%s'", k, escape_attr(v))
    end
    -- sort attributes alphabetically. this will ensure that
    -- their order will not change between several executions of dom:serialize()
    table.sort(t)
    if #t == 0 then return "" end
    -- add space before attributes
    return " " .. table.concat(t, " ")
  end
  local function start(typ, el, attr)
    insert(get_action(typ, "start"), el, prepare_attributes(attr))
  end
  local function text(typ, content, parent)
    local format = get_action(typ, "text")
    -- children of elements marked as verbatim are written without escaping
    if (parent or {}).verbatim then
      insert(format, content)
    else
      insert(format, escape_element(content))
    end
  end
  local function stop(typ, el)
    insert(get_action(typ, "stop"), el)
  end
  local root = current or parser._handler.root
  local name = root._name or "unnamed"
  local xtype = root._type or "untyped"
  local text_content = root._text or ""
  local attributes = root._attr or {}
  if xtype == "DTD" then
    -- tostring() yields the same "nil" placeholder that is removed below, but
    -- it also works on Lua versions where %s does not accept nil
    text_content = string.format('%s %s "%s" "%s"', name, attributes["_type"] or "", tostring(attributes._name), tostring(attributes._uri))
    -- remove unused fields
    text_content = text_content:gsub('"nil"','')
    text_content = text_content:gsub('%s*$','')
    attributes = {}
  elseif xtype == "ELEMENT" and void[name] and #(root._children or {}) < 1 then
    -- void element without children: a single self-closing tag
    insert(get_action(xtype, "void"), name, prepare_attributes(attributes))
    return output
  elseif xtype == "PI" then
    -- it contains spurious _text attribute
    attributes["_text"] = nil
  elseif xtype == "DECL" and name == "xml" then
    -- the xml declaration attributes must be in a correct order
    local encoding = attributes.encoding or "utf-8"
    -- fall back to version 1.0 so a missing attribute is never written as 'nil'
    insert("<?xml version='%s' encoding='%s' ?>", attributes.version or "1.0", encoding)
    return output
  elseif xtype == "CDATA" then
    -- return content unescaped
    insert("<![CDATA[%s]]>", text_content)
    return output
  end
  start(xtype, name, attributes)
  text(xtype, text_content, (current or {})._parent)
  for _, child in ipairs(root._children or {}) do
    output = serialize_dom(parser, child, level + 1, output)
  end
  stop(xtype, name)
  return output
end
--- XML parsing function
-- Parse the XML text and create the DOM object.
-- The function returns two values: the parser object that gives access to the
-- parsed tree, and the DOM_Object table that holds the node methods.
-- @return DOM_Object
parse = function(
xmltext --- String to be parsed
,voidElements --- hash table with void elements
)
local domHandler = handler.domHandler()
--- @type DOM_Object
local DOM_Object = xml.xmlParser(domHandler)
-- preserve whitespace
DOM_Object.options.stripWS = nil
-- don't try to expand entities
DOM_Object.options.expandEntities = nil
-- the module level `void` table is used when the caller doesn't pass its own
local voidElements = voidElements or void
DOM_Object._handler.options.voidElements = voidElements
DOM_Object:parse(xmltext)
DOM_Object.current = DOM_Object._handler.root
-- DOM_Object is used as a metatable for the nodes, so missing fields of
-- a node are looked up in DOM_Object itself
DOM_Object.__index = DOM_Object
DOM_Object.css_query = css_query()
-- Set DOM_Object as the metatable of the node and, recursively, of every node
-- stored in its `_children` table.
local function save_methods(element)
  setmetatable(element, DOM_Object)
  for _, child in ipairs(element._children or {}) do
    save_methods(child)
  end
end
local parser = setmetatable({}, DOM_Object)
--- Return the root node stored in the handler of the DOM_Object.
-- @return DOM_Object
function DOM_Object:root_node()
  local dom_handler = self._handler
  return dom_handler.root
end
--- Get the type of a node.
-- @param el [optional] node to get the type of; the current node is used when omitted
-- @return value of the `_type` field of the node
function DOM_Object:get_node_type(el)
  if el then
    return el._type
  end
  return self._type
end
--- Test if a node is an element.
-- @param el [optional] node to test; the current node is used when omitted
-- @return boolean true when the node type is "ELEMENT"
function DOM_Object:is_element(el)
  local node_type = self:get_node_type(el or self)
  return node_type == "ELEMENT"
end
--- Test if a node is a text node.
-- @param el [optional] node to test; the current node is used when omitted
-- @return boolean true when the node type is "TEXT"
function DOM_Object:is_text(el)
  local node_type = self:get_node_type(el or self)
  return node_type == "TEXT"
end
local lower = string.lower
--- Return the name of a node.
-- @param el [optional] node to inspect; the current node is used when omitted
-- @return string the `_name` field of the node, or "unnamed" when it is not set
function DOM_Object:get_element_name(el)
  local node = el or self
  local name = node._name
  if name then
    return name
  end
  return "unnamed"
end
--- Get the value of an attribute of the current node.
-- Nothing is returned when the current node is not an element.
-- @param name Attribute name
-- @return string attribute value, nil when the attribute is not set
function DOM_Object:get_attribute(name)
  if not self:is_element(self) then
    return
  end
  local attributes = self._attr
  if not attributes then
    return nil
  end
  return attributes[name]
end
--- Set the value of an attribute of the current node.
-- Nothing happens, and nothing is returned, when the current node is not an element.
-- @param name Attribute name
-- @param value Value to be set
-- @return boolean true when the attribute was set
function DOM_Object:set_attribute(name, value)
  if self:is_element(self) then
    -- create the attribute table on demand
    local attributes = self._attr or {}
    self._attr = attributes
    attributes[name] = value
    return true
  end
end
--- Serialize a node back to XML.
-- @param current [optional] element to be serialized
-- @return string
function DOM_Object:serialize(current)
  local node = current
  if not node then
    -- The plain parser object has no `_type`; in that case the node stays
    -- unset and serialize_dom() starts from the root of the document.
    if self._type then
      node = self
    end
  end
  local pieces = serialize_dom(self, node)
  return table.concat(pieces)
end
--- Get the text content of a node and all of its children.
-- Text and CDATA children contribute their text, child elements are processed
-- recursively, other node types are skipped.
-- @param current [optional] element which should be converted to text
-- @return string
function DOM_Object:get_text(current)
  local node = current or self
  if node:is_text() then
    return node._text or ""
  end
  local pieces = {}
  for _, child in ipairs(node:get_children()) do
    local piece
    if child:is_text() or child._type == "CDATA" then
      piece = child._text or ""
    elseif child:is_element() then
      piece = child:get_text()
    end
    if piece then
      pieces[#pieces + 1] = piece
    end
  end
  return table.concat(pieces)
end
--- Retrieve elements from the given path.
-- The path is list of elements separated by space,
-- starting from the top element of the current element.
-- Element names are compared case-insensitively. An empty path selects the
-- starting element itself.
-- @param path path to be traversed
-- @param current [optional] element which should be traversed. Default element is the root element of the DOM_Object
-- @return table of elements which match the path
function DOM_Object:get_path(path, current)
  local path_elements = {}
  for el in string.lower(path):gmatch("([^%s]+)") do
    path_elements[#path_elements + 1] = el
  end
  local matched = {}
  -- The position in the path is passed as an index. The path table itself is
  -- never modified, because it is shared by all branches of the tree: removing
  -- items from it would shorten the path for every following sibling.
  local function traverse_path(node, depth)
    local wanted = path_elements[depth]
    if wanted == nil then
      -- the whole path was consumed, this node is a match
      matched[#matched + 1] = node
      return
    end
    for _, child in ipairs(self:get_children(node)) do
      if self:is_element(child) and string.lower(self:get_element_name(child)) == wanted then
        traverse_path(child, depth + 1)
      end
    end
  end
  traverse_path(current or self:root_node(), 1)
  return matched
end
--- Select elements using the CSS selector syntax.
-- The selector is prepared and matched by the `css_query` object of the DOM_Object.
-- @param selector String using the CSS selector syntax
-- @return table with elements matching the selector.
function DOM_Object:query_selector(selector)
  local query = self.css_query
  local prepared = query:prepare_selector(selector)
  return query:get_selector_path(self, prepared)
end
--- Get table with children of a node.
-- @param el [optional] element to be selected; the current node is used when omitted
-- @return table with children of the selected element, a new empty table when it has none
function DOM_Object:get_children(el)
  local node = el or self
  return node._children or {}
end
--- Get the parent of a node.
-- @param el [optional] element to be selected; the current node is used when omitted
-- @return DOM_Object parent element
function DOM_Object:get_parent(el)
  if el then
    return el._parent
  end
  return self._parent
end
--- Execute function on the current node and all of its descendant nodes.
-- The difference to DOM_Object:traverse_elements() is that the function is
-- executed also on text nodes and all other kinds of XML nodes.
-- Children of a node are not visited when the function returns false for that node.
-- @param fn function which will be executed on the current element and all its children
-- @param current [optional] element to be selected
function DOM_Object:traverse(fn, current)
  local node = current or self
  -- the parsed object itself has no node type, start from the root node then
  if not node:get_node_type() then
    node = self:root_node()
  end
  local descend = fn(node) ~= false
  local is_container = node:is_element() or node:get_node_type() == "ROOT"
  if is_container and descend then
    for _, child in ipairs(node:get_children()) do
      self:traverse(fn, child)
    end
  end
end
--- Execute function on the current element and all of its descendant elements.
-- The function is called for element nodes and for the root node only.
-- Children of a node are not visited when the function returns false for that node.
-- @param fn function which will be executed on the current element and all its children
-- @param current [optional] element to be selected
-- @return nothing
function DOM_Object:traverse_elements(fn, current)
  local current = current or self
  current:traverse(function(node)
    if node:is_element() or node:get_node_type() == "ROOT" then
      -- the result must be passed on to traverse(); otherwise a `false`
      -- returned by the callback could never stop the descent
      return fn(node)
    end
  end)
end
--- Get table with the inner text of an element, every text node is a separate table item.
-- @param current [optional] element to be selected
-- @return table
function DOM_Object:strings(current)
  local collected = {}
  local function collect(node)
    if node:get_node_type() ~= "TEXT" then
      return
    end
    collected[#collected + 1] = node._text or ""
  end
  local start_node = current or self
  start_node:traverse(collect)
  return collected
end
--- Get table with the inner text of an element. Leading and trailing white
-- space is removed from every string and strings that are empty after that
-- are left out.
-- @param current [optional] element to be selected
-- @return table
function DOM_Object:stripped_strings(current)
  local node = current or self
  local cleaned = {}
  for _, raw in ipairs(node:strings()) do
    local trimmed = raw:gsub("^%s*", "")
    trimmed = trimmed:gsub("%s*$", "")
    if trimmed ~= "" then
      cleaned[#cleaned + 1] = trimmed
    end
  end
  return cleaned
end
--- Execute function on the children of every node in a node list,
-- for example the list returned by DOM_Object:get_path().
-- @param nodelist table with nodes selected by DOM_Object:get_path()
-- @param fn function to be executed
function DOM_Object:traverse_node_list(nodelist, fn)
  for _, node in ipairs(nodelist or {}) do
    -- get_children() returns an empty table for nodes without `_children`,
    -- so such nodes are skipped instead of raising an error in ipairs()
    for _, element in ipairs(self:get_children(node)) do
      fn(element)
    end
  end
end
--- Replace the current node with new one
-- @param new element which should replace the current element
-- @return boolean, message true on success; false and the message reported
-- by DOM_Object:find_element_pos() when the current node cannot be located
function DOM_Object:replace_node(new)
  local old = self
  local parent = self:get_parent(old)
  local id, msg = self:find_element_pos(old)
  if id then
    parent._children[id] = new
    -- the new node now lives in the child list of `parent`, so its parent
    -- link must point there as well
    if type(new) == "table" then
      new._parent = parent
    end
    return true
  end
  return false, msg
end
-- Restore correct links to parent nodes in the whole subtree of `el`.
-- Every child is re-linked, not only elements, so text and other
-- non-element nodes don't keep a stale `_parent` either.
local function fix_parents(el)
  for _, child in ipairs(el._children or {}) do
    child._parent = el
    fix_parents(child)
  end
end
--- Add a child node to the current node.
-- The parent link of the child is set to the current node and the parent
-- links inside of the child are fixed before it is inserted.
-- @param child element to be inserted as a current node child
-- @param position [optional] position at which should the node be inserted; the node is appended when omitted
function DOM_Object:add_child_node(child, position)
  child._parent = self
  fix_parents(child)
  local siblings = self._children
  if position then
    table.insert(siblings, position, child)
  else
    table.insert(siblings, child)
  end
end
--- Create a deep copy of a node.
-- The copy keeps the parent link of the original node; the copied
-- descendants are linked to their copied parents.
-- @param element [optional] element to be copied
-- @return DOM_Object element
function DOM_Object:copy_node(element)
  local element = element or self
  -- Deep copy of a plain table such as the attribute table. No metatable is
  -- set on it, so lookups of missing keys don't fall through to the DOM
  -- methods. `seen` protects against cyclic references.
  local function copy_table(tbl, seen)
    if seen[tbl] then return seen[tbl] end
    local copy = {}
    seen[tbl] = copy
    for k, v in pairs(tbl) do
      if type(v) == "table" and k ~= "__index" then
        copy[k] = copy_table(v, seen)
      else
        copy[k] = v
      end
    end
    return copy
  end
  -- Copy a node and its subtree; `parent` is stored as the parent of the copy.
  local function copy_tree(node, parent)
    local copy = {}
    for k, v in pairs(node) do
      if k == "_parent" then
        -- assigned after the loop
      elseif k == "_children" and type(v) == "table" then
        local children = {}
        for i, child in ipairs(v) do
          children[i] = copy_tree(child, copy)
        end
        copy._children = children
      elseif type(v) == "table" and k ~= "__index" then
        copy[k] = copy_table(v, {})
      else
        -- scalars, and the `__index` reference to the method table that
        -- nodes created by html_to_dom() carry: it refers back to itself,
        -- so following it would never terminate
        copy[k] = v
      end
    end
    copy._parent = parent
    return setmetatable(copy, DOM_Object)
  end
  return copy_tree(element, element._parent)
end
--- Create a new element.
-- The element is only created, it is not inserted to the child list of its parent.
-- @param name New tag name
-- @param attributes Table with attributes; an empty table is used when omitted
-- @param parent [optional] element which should be saved as the element's parent; the current node is used when omitted
-- @return DOM_Object element
function DOM_Object:create_element(name, attributes, parent)
  local element = {
    _type = "ELEMENT",
    _name = name,
    _attr = attributes or {},
    _children = {},
    _parent = parent or self
  }
  save_methods(element)
  return element
end
--- Create a new text node.
-- The node is only created, it is not inserted to the child list of its parent.
-- @param text string
-- @param parent [optional] element which should be saved as the element's parent; the current node is used when omitted
-- @return DOM_Object text object
function DOM_Object:create_text_node(text, parent)
  local node = {
    _type = "TEXT",
    _parent = parent or self,
    _text = text
  }
  save_methods(node)
  return node
end
--- Delete a node.
-- The node is not taken out of the child list of its parent; its slot is
-- overwritten with a placeholder node of the type "removed", so the
-- positions of the other children don't change.
-- @param element [optional] element to be removed
function DOM_Object:remove_node(element)
  local node = element or self
  local parent = self:get_parent(node)
  local pos = self:find_element_pos(node)
  if not pos then
    return
  end
  parent._children[pos] = setmetatable({_type = "removed"}, DOM_Object)
end
--- Find the position of a node in the child list of its parent.
-- @param el [optional] element which should be looked up
-- @return integer position of the element in the child list of its parent.
-- nil and a message are returned when the node has no parent or the parent is
-- neither an element nor the root node; false and a message are returned when
-- the node is not found among the children of its parent.
function DOM_Object:find_element_pos(el)
  local el = el or self
  local parent = self:get_parent(el)
  -- A detached node has no parent. It must be tested explicitly, because
  -- is_element(nil) and get_node_type(nil) inspect `self` instead.
  if not parent or (not self:is_element(parent) and self:get_node_type(parent) ~= "ROOT") then
    return nil, "The parent isn't element"
  end
  for i, x in ipairs(parent._children or {}) do
    if x == el then return i end
  end
  return false, "Cannot find element"
end
--- Get node list which a node is part of.
-- Nothing is returned when the node has no parent or when its parent is not
-- an element.
-- @param el [optional] element for which the sibling element list should be retrieved
-- @return table with elements
function DOM_Object:get_siblings(el)
  local el = el or self
  local parent = el:get_parent()
  -- detached nodes have no parent, so there is nothing to call is_element() on
  if parent and parent:is_element() then
    return parent:get_children()
  end
end
--- Get a sibling of the current node.
-- @param change Distance from the current node
-- @return DOM_Object node, nil when the position or the sibling list of the
-- current node cannot be found or when there is no node at that distance
function DOM_Object:get_sibling_node(change)
  local pos = self:find_element_pos()
  local siblings = self:get_siblings()
  if not (pos and siblings) then
    return
  end
  return siblings[pos + change]
end
--- Get the node that directly follows a node in its sibling list.
-- @param el [optional] node to be used
-- @return DOM_Object node
function DOM_Object:get_next_node(el)
  local node = el or self
  local distance = 1
  return node:get_sibling_node(distance)
end
--- Get the node that directly precedes a node in its sibling list.
-- @param el [optional] node to be used
-- @return DOM_Object node
function DOM_Object:get_prev_node(el)
  local node = el or self
  local distance = -1
  return node:get_sibling_node(distance)
end
--- Parse a string and return the dummy element that wraps the created nodes.
-- NOTE(review): `is_xml` has no effect at the moment, the string is always
-- processed by parse(). Confirm whether html_parse() was intended for the
-- case when `is_xml` is not set.
-- @param str string to be parsed; an empty string is used when omitted
-- @param is_xml currently unused
-- @return DOM_Object the wrapping element, the parsed nodes are its children
function DOM_Object:create_template(str, is_xml)
  -- <> is a dummy element, we just need to wrap everything in some element
  local wrapped = "<>" .. (str or "") .. "</>"
  local template = parse(wrapped)
  local document_root = template:root_node()
  return document_root._children[1]
end
--- Parse a string and use the created nodes as the new children of the
-- current node. The original children are dropped.
-- @param str HTML or XML to be inserted
-- @param is_xml [optional] passed to DOM_Object:create_template()
-- @return DOM_Object the current node
function DOM_Object:inner_html(str, is_xml)
  local el = self
  local root = self:create_template(str, is_xml)
  local children = root._children or {}
  -- the parsed nodes still point to the dummy wrapper element, so they must
  -- be re-linked to their new parent
  for _, child in ipairs(children) do
    child._parent = el
  end
  -- replace original children of the current element with children of the dummy element created by parsing
  el._children = children
  return el
end
--- Parse a string and insert the created nodes before the current element.
-- The nodes are added to the parent of the current element, starting at the
-- position of the current element.
-- @param str HTML or XML to be inserted
-- @param is_xml [optional] passed to DOM_Object:create_template()
function DOM_Object:insert_before_begin(str, is_xml)
  local template_root = self:create_template(str, is_xml)
  local parent = self:get_parent()
  local current_pos = self:find_element_pos()
  local new_nodes = template_root:get_children()
  for i = 1, #new_nodes do
    local node = new_nodes[i]
    parent:add_child_node(node, current_pos + i - 1)
  end
end
--- Parse a string and insert the created nodes at the beginning of the
-- child list of the current element.
-- @param str HTML or XML to be inserted
-- @param is_xml [optional] passed to DOM_Object:create_template()
function DOM_Object:insert_after_begin(str, is_xml)
  local template_root = self:create_template(str, is_xml)
  local new_nodes = template_root:get_children()
  for position = 1, #new_nodes do
    -- the i-th new node becomes the i-th child
    self:add_child_node(new_nodes[position], position)
  end
end
--- Parse a string and append the created nodes to the child list of the
-- current element.
-- @param str HTML or XML to be inserted
-- @param is_xml [optional] passed to DOM_Object:create_template()
function DOM_Object:insert_before_end(str, is_xml)
  local template_root = self:create_template(str, is_xml)
  local new_nodes = template_root:get_children()
  for i = 1, #new_nodes do
    local node = new_nodes[i]
    self:add_child_node(node)
  end
end
--- Parse a string and insert the created nodes after the current element.
-- The nodes are added to the parent of the current element, starting at the
-- position that follows the current element.
-- @param str HTML or XML to be inserted
-- @param is_xml [optional] passed to DOM_Object:create_template()
function DOM_Object:insert_after_end(str, is_xml)
  local template_root = self:create_template(str, is_xml)
  local parent = self:get_parent()
  local current_pos = self:find_element_pos()
  local new_nodes = template_root:get_children()
  for i = 1, #new_nodes do
    local node = new_nodes[i]
    parent:add_child_node(node, current_pos + i)
  end
end
-- include the methods to all xml nodes
save_methods(parser._handler.root)
-- parser:
return parser, DOM_Object
end
-- table of elements that should be kept without XML escaping in the DOM serialization
local verbatim_elements = {script=true, style=true}
--- Convert the tree produced by the HTML parser to the LuaXML DOM.
-- @param html_object root node of the tree created by the HTML parser
-- @return DOM_Object
function html_to_dom(html_object)
  -- an empty text is parsed just to get an initialized DOM object
  local dom, DOM_Object = parse("")
  -- the root of the DOM object is the parent of the top level nodes
  local current_parent = dom._handler.root
  -- turn a plain table to a node with the DOM methods and an empty child list
  local function create_node(tbl)
    tbl._children = {}
    tbl.__index = DOM_Object
    return setmetatable(tbl, DOM_Object)
  end
  local build_tree
  -- convert all children of the HTML node under the current parent
  local function build_children(object)
    for _, child in ipairs(object.children) do
      build_tree(child)
    end
  end
  -- converters for particular node types of the HTML parser
  local builders = {
    doctype = function(object)
      current_parent:add_child_node(create_node {_name=object.name, _type="DTD"})
    end,
    comment = function(object)
      current_parent:add_child_node(create_node {_text=object.text, _type="COMMENT"})
    end,
    element = function(object)
      -- convert attributes to the form expected by the DOM object
      local attributes = {}
      for _, attr in ipairs(object.attr) do
        attributes[attr.name] = attr.value
      end
      local element = current_parent:create_element(object.tag, attributes)
      -- disable escaping of text in dom:serialize() for <script> or <style> elements
      if verbatim_elements[string.lower(object.tag)] then
        element.verbatim = true
      end
      current_parent:add_child_node(element)
      -- the new element is the parent while its children are processed
      local old_parent = current_parent
      current_parent = element
      build_children(object)
      current_parent = old_parent
    end,
    text = function(object)
      local text = current_parent:create_text_node(object.text)
      current_parent:add_child_node(text)
    end
  }
  build_tree = function(object)
    -- for other node types, just process the children
    local builder = builders[object._type] or build_children
    builder(object)
  end
  build_tree(html_object)
  return dom
end
--- Parse HTML text as a DOM object.
-- The text is processed by the HTML parser and the resulting tree is
-- converted by html_to_dom(), so the returned object supports the same
-- methods as the object returned by the parse() function.
-- @param html_str string with the HTML code to be parsed
-- @return DOM_Object
function html_parse(html_str)
  local tree = HtmlParser:init(html_str):parse()
  return html_to_dom(tree)
end
--- @export
return {
parse = parse,
serialize_dom= serialize_dom,
html_parse = html_parse
}