Модуль:HTML

Источник: https://github.com/soulik/lpeg-grammars


table#1 {
  ["doctype"] = table#2 {
    ["attributes"] = table#3 {
      ["html"] = true,
    },
  },
  ["elements"] = table#4 {
    table#5 {
      ["elementName"] = "html",
      ["elements"] = table#6 {
        table#7 {
          ["elementName"] = "head",
          ["elements"] = table#8 {
            table#9 {
              ["elementName"] = "title",
              ["elements"] = table#10 {
                table#11 {
                  ["text"] = "A sample code",
                  ["type"] = "text",
                },
              },
              ["type"] = "tag",
            },
            table#12 {
              ["attributes"] = table#13 {
                ["charset"] = "UTF-8",
              },
              ["elementName"] = "meta",
              ["type"] = "tag",
            },
          },
          ["type"] = "tag",
        },
        table#14 {
          ["elementName"] = "body",
          ["elements"] = table#15 {
            table#16 {
              ["elementName"] = "h1",
              ["elements"] = table#17 {
                table#18 {
                  ["text"] = "Hello world!",
                  ["type"] = "text",
                },
              },
              ["type"] = "tag",
            },
            table#19 {
              ["elementName"] = "br",
              ["type"] = "tag",
            },
          },
          ["type"] = "tag",
        },
      },
      ["type"] = "tag",
    },
  },
}

-- https://github.com/soulik/lpeg-grammars/blob/master/lexers/html.lua

local lpeg = lpeg or require 'lpeg'
local locale = lpeg.locale()

local M = {}
local P = lpeg.P
local S = lpeg.S
local R = lpeg.R
local C = lpeg.C
local Cg = lpeg.Cg
local Cc = lpeg.Cc
local Cb = lpeg.Cb
local Ct = lpeg.Ct
local Cf = lpeg.Cf
local Cmt = lpeg.Cmt
local V = lpeg.V
local space = locale.space
local word = locale.alnum

-- Terminal patterns shared by the grammar rules built further down.

-- Tag delimiters: '<', '>', '</', '/>' and the '<!' that opens a doctype.
local lt, gt = P'<', P'>'
local lte, gte = P'</', P'/>'
local ltdt = P'<!'
-- Quote characters accepted around attribute values.
local squot = P"'"
local dquot = P'"'
-- Comment delimiters.
local cb, ce = P'<!--', P'-->'
local equal = P'='
-- Tag and attribute names. The first character is unchanged from before
-- (alphanumeric, ':' or '_'); following characters may additionally be '-'
-- or '.', so names such as "data-id", "http-equiv" or "my-widget" are
-- accepted instead of stopping the parse at the hyphen.
local nameStart = word + S':_'
local nameChar = nameStart + S'-.'
local name = nameStart * nameChar^0
-- CDATA section delimiters.
local cdataS=P'<![CDATA['
local cdataE=P']]>'
-- Character data: one or more characters up to (not including) the next '<'.
local text = (1 - lt)^1
-- Only the all-lowercase and all-uppercase spellings of the keyword are accepted.
local doctype = P'doctype' + P'DOCTYPE'

-- Metatable that lets a plain table be driven as a stack through operators:
--   stack + value  appends the value and evaluates to the stack itself
--   -stack         removes the last value and evaluates to it
local stackMT = {}

-- Unary minus: pop the most recently added value.
function stackMT.__unm(self)
	return table.remove(self)
end

-- Addition: push `item` and hand the stack back so calls can be chained.
function stackMT.__add(self, item)
	table.insert(self, item)
	return self
end

--- Creates a new empty table with `stackMT` attached as its metatable.
-- @return the freshly created table
local function stack()
	return setmetatable({}, stackMT)
end

--- Builds a debugging pass-through function.
-- The returned function prints `name`, then every argument it receives
-- (converted with tostring and quoted with %q), and finally returns its
-- arguments unchanged.
-- @param name label printed before the values
-- @return function(...) that prints and then returns `...`
local D = function(name)
	-- `unpack` is a global in Lua 5.1 and lives in `table` from 5.2 onwards.
	local unpackValues = table.unpack or unpack
	return function(...)
		print(name,':')
		-- select('#', ...) counts nil arguments too; iterating {...} with
		-- ipairs would stop at the first nil and hide the remaining values.
		local count = select('#', ...)
		local quoted = {}
		for i = 1, count do
			-- The extra parentheses truncate select's result to the i-th value.
			quoted[i] = ('%q'):format(tostring((select(i, ...))))
		end
		print(unpackValues(quoted, 1, count))
		return ...
	end
end

--- Builds a pattern that consumes no input and produces the constant `name`
-- inside a group capture named 'type'.
-- @param name value to produce under the 'type' group
-- @return the resulting lpeg pattern
local function elementType(name)
	local constant = Cc(name)
	return Cg(constant, 'type')
end

--- Fold step used while collecting attributes: records one name/value pair.
-- lpeg.Cf only uses the first value returned by the folding function, so
-- only the accumulator table is returned.
local function collectAttribute(attributes, key, value)
	attributes[key] = value
	return attributes
end

--- Match-time predicate for closing tags: succeeds only when the name in the
-- closing tag equals the name captured for the enclosing opening tag.
local function sameTagName(subject, position, closingName, openingName)
	return closingName == openingName
end

-- The grammar holds no per-call state, so it is compiled once here instead of
-- on every call to M.parse. Earlier revisions pushed tag names onto a helper
-- stack from inside the closing-tag check; nothing ever read that stack, so
-- it has been dropped. The closing-tag check relies solely on the
-- 'elementName' back capture.
local grammar = P {
	'htmlDocument',
	htmlDocument = space^0 * Ct(V'doctype' * V'elements'),
	doctype = Cg(Ct(ltdt * doctype * (V'tagAttributesList')^(-1) * space^0 * gt), 'doctype'),

	-- A list of child nodes, each captured as its own table.
	elements =
		space^0 *
		Cg(
			Ct(
				(Ct(V'tag' + V'comment' + V'textContent' + V'CDATA'))^0
			),
			'elements'
		)
		* space^0,

	-- A paired tag is tried first; if no matching closing tag is found the
	-- parser backtracks and treats the tag as a single (void) tag.
	tag = space^0 * (V'pairTag' + V'singleTag') * elementType('tag') * space^0,
	tagName = name,

	singleTag = lt * Cg(V'tagName', 'elementName') * (V'tagAttributesList')^(-1) * space^0 * (gte + gt),

	pairTagBegin = lt * Cg(C(V'tagName'), 'elementName') * (V'tagAttributesList')^(-1) * gt,
	-- Cb'elementName' yields the name of the innermost still-open tag: the
	-- names of already completed child nodes sit inside closed table captures
	-- and are skipped by the back capture.
	pairTagEnd = lte * Cmt(C(V'tagName') * Cb'elementName', sameTagName) * gt,
	pairTag = V'pairTagBegin' * V'elements' * V'pairTagEnd',

	tagAttributeName = C(name),
	-- Values must be quoted with single or double quotes.
	tagAttributeValue = (squot * C((1-squot)^0) * squot) + (dquot * C((1-dquot)^0) * dquot),

	-- A bare attribute name is recorded with the value `true`.
	tagAttributeFlag = Cg(V'tagAttributeName' * Cc(true)),
	tagAttributeValued = Cg(V'tagAttributeName' * equal * V'tagAttributeValue'),
	tagAttribute = (V'tagAttributeValued' + V'tagAttributeFlag'),

	tagAttributes = V'tagAttribute' * (space^1 * V'tagAttribute')^0,
	tagAttributesList = space^1 * Cg(Cf(Ct'' * V'tagAttributes', collectAttribute), 'attributes') * space^0,

	textContent = space^0 * Cg(C(text), 'text') * elementType('text') * space^0,

	commentContent = C((1- ce)^0),
	comment = space^0 * cb * Cg(V'commentContent', 'text') * elementType('comment') * ce * space^0,

	CDATAcontent = C((1- cdataE)^0),
	CDATA = space^0 * (cdataS * space^0 * Cg(V'CDATAcontent', 'text') * space^0 * cdataE) * elementType('CDATA') * space^0,
}

--- Parses an HTML document into a tree of plain Lua tables.
--
-- The result has a `doctype` table (with an `attributes` table when the
-- doctype carries any) and an `elements` list. Every node in a list has a
-- `type` of 'tag', 'text', 'comment' or 'CDATA'. Tag nodes carry
-- `elementName`, optionally `attributes`, and, for paired tags, their own
-- `elements` list; the other node types carry their content in `text`.
--
-- The match is anchored at the start of the input only: whatever follows the
-- last node that could be parsed is ignored rather than reported.
--
-- @param html string containing the document, starting (after optional
--             whitespace) with a doctype declaration
-- @return the document table, or nil when the input does not match
M.parse = function(html)
	return grammar:match(html)
end

--- Demo entry point: parses a built-in sample page with M.parse and returns
-- whatever mw.dumpObject produces for the result.
-- @param frame not used by this function (presumably the Scribunto frame
--              object supplied on invocation -- confirm against the caller)
function M.test (frame)
	local sample = [[
<!DOCTYPE html>
<html>
	<head>
		<title>A sample code</title>
		<meta charset="UTF-8"/>
	</head>
	<body>
		<h1>Hello world!</h1><br />
	</body>
</html>]]
	local document = M.parse(sample)
	return mw.dumpObject (document)
end

return M