In this post, I’ll go through the process of writing a domain-specific language (DSL) in Go.
Even though this is a simple example, it can be a good starting point for anyone who wants to write their own DSL.
If you are like me, trying to make sense of how programming languages work under the hood, writing a DSL is a great way to get started. It is a fun exercise that can help you understand the basics of parsing and interpreting a language.
Before we get into the code, let’s define the language we want to implement.
My choice was to write a language that can be used to generate HTML code without using the actual HTML syntax. I called it HTnoML (Hypertext no Markup Language), and it looks like this:
{:html
{:head
{:meta charset[utf-8]}
{:title > This is a HTML5 template}
{:meta name[description] content[This is a HTML5 template]}
{:meta name[author] content[Radu Lucut]}
{:link rel[stylesheet] href[css/style.css]}
}
{:body
{class[container main] > This is HTnoML syntax}
}
}
- `{}` represents an element. The tag name can be specified by adding a `:` followed by the name, otherwise it defaults to `div`.
- Attributes are separated by whitespace and can have a value inside `[]`.
- `>` is followed by the text content of the element.
Now that we have the rules, let’s write the code.
We’ll start by defining the data structures we will use to represent the parsed data.
Code
// Chunk references a half-open byte range [Start, End) within the
// parser's input buffer, so no substrings need to be copied.
type Chunk struct {
Start int
End int
}
// Attribute is a single name[value] pair on an element.
// Value is nil for attributes written without a [...] part.
type Attribute struct {
Name *Chunk
Value *Chunk
}
// Node is one node of the parsed tree: either a text node (Text set)
// or an element node (the remaining fields set). ElementType is nil
// for elements that default to "div".
type Node struct {
ElementType *Chunk
Text *Chunk // Only for text nodes
Attributes []Attribute // Only for element nodes
Children []Node // Only for element nodes
}
// TokenType classifies a token produced by the lexer.
type TokenType byte

// Token kinds. Values start at 1 so the zero value of TokenType is
// distinguishable from any valid kind.
const (
	TokenTypeWhitespace TokenType = iota + 1 // run of spaces, tabs, CR, LF
	TokenTypeIdentifier                      // run of non-reserved, non-whitespace bytes
	TokenTypeReserved                        // one structural byte: { } [ ] : >
)
// Token is a lexed piece of the input: its kind plus the half-open
// byte range [Start, End) it covers in the parser's buffer.
type Token struct {
Type TokenType
Start int
End int
}
// Parser holds the full input plus the lexer/parser state.
type Parser struct {
buf []byte
pos int
line int
col int
curr *Token
node *Node
}
To keep things simple, we’ll read the entire input into memory. This may not be the best approach for large files, but it’s good enough for our example.
Code
// NewParser reads the entire input into memory and returns a Parser
// positioned at the start of the buffer (line 1, column 1). It returns
// an error only if reading the input fails.
func NewParser(reader io.ReadSeeker) (*Parser, error) {
	data, err := io.ReadAll(reader)
	if err != nil {
		return nil, err
	}
	p := &Parser{buf: data, line: 1, col: 1}
	return p, nil
}
Once we have the input in memory, we can start the parsing process. The first step is to transform the input into tokens. A token is a small piece of the input that has a specific meaning in the language. This process is called lexical analysis.
In our case, we have three types of tokens: whitespace, reserved, and identifiers.
Code
// nextToken consumes and returns the next token, preferring any token
// previously buffered by peekToken. It returns nil at end of input.
func (p *Parser) nextToken() *Token {
	if p.curr == nil {
		return p.readNextToken()
	}
	t := p.curr
	p.curr = nil
	return t
}
// peekToken returns the next token without consuming it, buffering the
// result so repeated peeks are cheap. It returns nil at end of input.
func (p *Parser) peekToken() *Token {
	if p.curr != nil {
		return p.curr
	}
	p.curr = p.readNextToken()
	return p.curr
}
// readNextToken scans one token starting at the current position and
// advances past it. It returns nil at end of input.
//
// Fix: the character-class predicates below are now pure. Previously
// isWhitespace updated line/col as a side effect, and because it is
// called both by isIdentifier (to detect the end of an identifier) and
// again by the next readNextToken call on the very same byte, a newline
// terminating an identifier was counted twice. Position tracking now
// happens exactly once per consumed byte, in advance.
func (p *Parser) readNextToken() *Token {
	if p.pos >= len(p.buf) {
		return nil
	}
	start := p.pos
	if p.isWhitespace(p.buf[p.pos]) {
		for p.pos < len(p.buf) && p.isWhitespace(p.buf[p.pos]) {
			p.advance()
		}
		return &Token{
			Type:  TokenTypeWhitespace,
			Start: start,
			End:   p.pos,
		}
	}
	if p.isReserved(p.buf[p.pos]) {
		// Reserved characters are always single-byte tokens.
		p.advance()
		return &Token{
			Type:  TokenTypeReserved,
			Start: start,
			End:   p.pos,
		}
	}
	for p.pos < len(p.buf) && p.isIdentifier(p.buf[p.pos]) {
		p.advance()
	}
	return &Token{
		Type:  TokenTypeIdentifier,
		Start: start,
		End:   p.pos,
	}
}

// advance consumes the byte at the current position, keeping the
// line/column counters in sync (columns restart at 1 after a newline).
func (p *Parser) advance() {
	if p.buf[p.pos] == '\n' {
		p.line++
		p.col = 1
	} else {
		p.col++
	}
	p.pos++
}

// isWhitespace reports whether b is a whitespace byte. Pure: it has no
// side effects on parser state.
func (p *Parser) isWhitespace(b byte) bool {
	switch b {
	case ' ', '\t', '\r', '\n':
		return true
	}
	return false
}

// isReserved reports whether b is one of the structural bytes of the
// language.
func (p *Parser) isReserved(b byte) bool {
	switch b {
	case '{', '}', '[', ']', ':', '>':
		return true
	}
	return false
}

// isIdentifier reports whether b can be part of an identifier: any byte
// that is neither whitespace nor reserved.
func (p *Parser) isIdentifier(b byte) bool {
	return !p.isWhitespace(b) && !p.isReserved(b)
}
Once we have the tokens, we are going to build a tree structure that represents the parsed data. It gets us closer to the abstract representation of the language and will make it easier to generate the output and to extend the language in the future. This is called syntactic analysis.
Code
// parse builds the node tree from the token stream. The root node is a
// synthetic container; each {...} in the input becomes a descendant.
//
// Fixes over the original:
//   - truncated input (EOF right after ':', '[', or '>') no longer
//     dereferences a nil token;
//   - an unmatched '}' at the root no longer sets curr to nil (which
//     made the next token panic);
//   - the token following an attribute identifier is no longer
//     unconditionally consumed, so a '{' or '}' directly after an
//     attribute keeps its structural meaning;
//   - an empty value "name[]" yields an empty chunk instead of
//     recording the ']' byte as the value.
func (p *Parser) parse() {
	p.node = &Node{}
	childParentMap := map[*Node]*Node{}
	curr := p.node
	for p.peekToken() != nil {
		t := p.nextToken()
		if t.Type == TokenTypeWhitespace {
			continue
		}
		if t.Type == TokenTypeReserved {
			switch p.buf[t.Start] {
			case ':':
				// The element type name follows the colon.
				name := p.nextToken()
				if name == nil {
					return // truncated input: ':' at end of file
				}
				curr.ElementType = &Chunk{
					Start: name.Start,
					End:   name.End,
				}
			case '{':
				// Open a child element and descend into it.
				curr.Children = append(curr.Children, Node{})
				el := &curr.Children[len(curr.Children)-1]
				childParentMap[el] = curr
				curr = el
			case '}':
				// Close the current element; ignore an unmatched '}'.
				if parent := childParentMap[curr]; parent != nil {
					curr = parent
				}
			case '>':
				// Text content runs until the closing '}' (exclusive);
				// the '}' itself is handled on the next iteration.
				start, end := p.collectUntil('}')
				curr.Children = append(curr.Children, Node{
					Text: &Chunk{Start: start, End: end},
				})
			}
			continue
		}
		if t.Type == TokenTypeIdentifier {
			// An identifier is an attribute name, optionally followed
			// immediately by a [value] part.
			attr := Attribute{Name: &Chunk{Start: t.Start, End: t.End}}
			pt := p.peekToken()
			if pt != nil && pt.Type == TokenTypeReserved && p.buf[pt.Start] == '[' {
				p.nextToken() // consume '['
				start, end := p.collectUntil(']')
				attr.Value = &Chunk{Start: start, End: end}
			}
			curr.Attributes = append(curr.Attributes, attr)
		}
	}
}

// collectUntil consumes tokens up to (but not including) the next
// reserved byte stop, returning the byte range they cover. The range is
// empty when stop (or end of input) immediately follows.
func (p *Parser) collectUntil(stop byte) (int, int) {
	start, end := p.pos, p.pos
	if pt := p.peekToken(); pt != nil {
		start, end = pt.Start, pt.Start
	}
	for {
		pt := p.peekToken()
		if pt == nil || (pt.Type == TokenTypeReserved && p.buf[pt.Start] == stop) {
			return start, end
		}
		end = p.nextToken().End
	}
}
Finally, once the tree is built, converting it to HTML becomes an easy task. We’ll use a recursive function to traverse the tree and write the HTML to a writer.
Code
// ToHTML writes the HTML rendering of the input to writer.
// The input is parsed lazily on the first call; subsequent calls reuse
// the existing tree. Write errors from the writer are not reported.
func (p *Parser) ToHTML(writer io.Writer) {
if p.node == nil {
p.parse()
}
p.nodeToHTML(writer, p.node, true)
}
// nodeToHTML recursively renders node and its subtree as HTML. The
// root node is a synthetic container, so it emits no tag of its own —
// only its children. Write errors are deliberately ignored, matching
// the rest of this example.
func (p *Parser) nodeToHTML(w io.Writer, node *Node, isRoot bool) {
	// Text nodes carry raw content and nothing else.
	if node.Text != nil {
		w.Write(p.buf[node.Text.Start:node.Text.End])
		return
	}
	var tag []byte
	if !isRoot {
		tag = []byte("div") // default when no ':' tag name was given
		if node.ElementType != nil {
			tag = p.buf[node.ElementType.Start:node.ElementType.End]
		}
		w.Write([]byte{'<'})
		w.Write(tag)
		for _, attr := range node.Attributes {
			w.Write([]byte{' '})
			w.Write(p.buf[attr.Name.Start:attr.Name.End])
			if attr.Value == nil {
				continue
			}
			w.Write([]byte{'=', '"'})
			w.Write(p.buf[attr.Value.Start:attr.Value.End])
			w.Write([]byte{'"'})
		}
		// Void elements (meta, link, ...) are self-closing and have no
		// closing tag. NOTE(review): isVoidElement is defined elsewhere
		// in this project.
		if len(node.Children) == 0 && isVoidElement(string(tag)) {
			w.Write([]byte(" />"))
			return
		}
		w.Write([]byte{'>'})
	}
	// Index to avoid copying each child Node during iteration.
	for i := range node.Children {
		p.nodeToHTML(w, &node.Children[i], false)
	}
	if !isRoot {
		w.Write([]byte("</"))
		w.Write(tag)
		w.Write([]byte{'>'})
	}
}
And that’s it! We have a DSL that can be used to generate HTML code.
Writing a DSL is a fun exercise that can help you understand the basics of parsing and interpreting a language.
references/inspiration: