- avoid linking to non-existent pages

- improve crawling for category page content
This commit is contained in:
Benjamin Schoch 2022-08-30 15:20:58 +02:00 committed by Avelino
parent 96677e7650
commit 8b0532228a

View File

@ -3,7 +3,6 @@ package main
import ( import (
"bytes" "bytes"
"fmt" "fmt"
"io/ioutil"
"log" "log"
"os" "os"
"strings" "strings"
@ -27,8 +26,11 @@ type Object struct {
} }
func main() { func main() {
GenerateHTML() err := GenerateHTML()
input, err := ioutil.ReadFile("./tmpl/index.html") if err != nil {
panic(err)
}
input, err := os.ReadFile("./tmpl/index.html")
if err != nil { if err != nil {
panic(err) panic(err)
} }
@ -38,22 +40,28 @@ func main() {
panic(err) panic(err)
} }
objs := []Object{} objs := make(map[string]*Object)
query.Find("body #content ul ul").First().Each(func(_ int, s *goquery.Selection) { query.Find("body #content ul ul").First().Each(func(_ int, s *goquery.Selection) {
s.Find("li a").Each(func(_ int, s *goquery.Selection) { s.Find("li a").Each(func(_ int, s *goquery.Selection) {
selector, _ := s.Attr("href") selector, exists := s.Attr("href")
if !exists {
return
}
obj := makeObjById(selector, query.Find("body")) obj := makeObjById(selector, query.Find("body"))
objs = append(objs, obj) if obj == nil {
return
}
objs[selector] = obj
}) })
}) })
makeSiteStruct(objs) makeCategoryPages(objs)
linkCategoryPagesInIndex(string(input), query, objs)
makeSitemap(objs) makeSitemap(objs)
changeLinksInIndex(string(input), query)
} }
func makeSiteStruct(objs []Object) { func makeCategoryPages(objs map[string]*Object) {
for _, obj := range objs { for _, obj := range objs {
folder := fmt.Sprintf("tmpl/%s", obj.Slug) folder := fmt.Sprintf("tmpl/%s", obj.Slug)
err := os.Mkdir(folder, 0755) err := os.Mkdir(folder, 0755)
@ -67,16 +75,16 @@ func makeSiteStruct(objs []Object) {
} }
} }
func makeSitemap(objs []Object) { func makeSitemap(objs map[string]*Object) {
t := template.Must(template.ParseFiles("tmpl/sitemap-tmpl.xml")) t := template.Must(template.ParseFiles("tmpl/sitemap-tmpl.xml"))
f, _ := os.Create("tmpl/sitemap.xml") f, _ := os.Create("tmpl/sitemap.xml")
t.Execute(f, objs) t.Execute(f, objs)
} }
func makeObjById(selector string, s *goquery.Selection) (obj Object) { func makeObjById(selector string, s *goquery.Selection) (obj *Object) {
s.Find(selector).Each(func(_ int, s *goquery.Selection) { s.Find(selector).Each(func(_ int, s *goquery.Selection) {
desc := s.NextFiltered("p") desc := s.NextFiltered("p")
ul := desc.NextFiltered("ul") ul := s.NextFilteredUntil("ul", "h2")
links := []Link{} links := []Link{}
ul.Find("li").Each(func(_ int, s *goquery.Selection) { ul.Find("li").Each(func(_ int, s *goquery.Selection) {
@ -88,7 +96,7 @@ func makeObjById(selector string, s *goquery.Selection) (obj Object) {
} }
links = append(links, link) links = append(links, link)
}) })
obj = Object{ obj = &Object{
Slug: slug.Generate(s.Text()), Slug: slug.Generate(s.Text()),
Title: s.Text(), Title: s.Text(),
Description: desc.Text(), Description: desc.Text(),
@ -98,17 +106,24 @@ func makeObjById(selector string, s *goquery.Selection) (obj Object) {
return return
} }
func changeLinksInIndex(html string, query *goquery.Document) { func linkCategoryPagesInIndex(html string, query *goquery.Document, objs map[string]*Object) {
query.Find("body #content ul li ul li a").Each(func(_ int, s *goquery.Selection) { query.Find("body #content ul li ul li a").Each(func(_ int, s *goquery.Selection) {
href, hrefExists := s.Attr("href")
if !hrefExists {
return
}
// do not replace links if no page has been created for it
_, objExists := objs[href]
if !objExists {
return
}
href, exists := s.Attr("href")
if exists {
uri := strings.SplitAfter(href, "#") uri := strings.SplitAfter(href, "#")
if len(uri) >= 2 && uri[1] != "contents" { if len(uri) >= 2 && uri[1] != "contents" {
html = strings.ReplaceAll( html = strings.ReplaceAll(
html, fmt.Sprintf(`href="%s"`, href), fmt.Sprintf(`href="%s"`, uri[1])) html, fmt.Sprintf(`href="%s"`, href), fmt.Sprintf(`href="%s"`, uri[1]))
} }
}
}) })
os.WriteFile("./tmpl/index.html", []byte(html), 0644) os.WriteFile("./tmpl/index.html", []byte(html), 0644)