From 3b5ed071b3c4f234cf1b1c5af02661958c8515ae Mon Sep 17 00:00:00 2001 From: Jacky Zhao Date: Mon, 27 Dec 2021 16:19:05 -0500 Subject: [PATCH] minor file refactoring --- contentIndex.yaml | 45 ++++++++++ go.mod | 1 + go.sum | 2 + linkIndex.yaml | 6 ++ main.go | 216 ++-------------------------------------------- parse.go | 54 ++++++++++++ util.go | 43 +++++++++ walk.go | 74 ++++++++++++++++ write.go | 69 +++++++++++++++ 9 files changed, 300 insertions(+), 210 deletions(-) create mode 100644 contentIndex.yaml create mode 100644 linkIndex.yaml create mode 100644 parse.go create mode 100644 util.go create mode 100644 walk.go create mode 100644 write.go diff --git a/contentIndex.yaml b/contentIndex.yaml new file mode 100644 index 0000000..b7ebcd8 --- /dev/null +++ b/contentIndex.yaml @@ -0,0 +1,45 @@ +# THIS FILE WAS GENERATED USING github.com/jackyzha0/hugo-obsidian +# DO NOT EDIT +README: + title: Untitled Page + content: | + # Obsidian Link Scrapper + Used by [Quartz](https://github.com/jackyzha0/quartz) + + This repository comes to you in two parts. + + 1. GitHub Action (scrapes links into a `.yml` file) + 2. Hugo Partial (turns `.yml` file into graphs and tables) + + ## GitHub Action + GitHub action and binary to scrape [Obsidian](http://obsidian.md/) vault for links and exposes them as a `.yml` file for easy consumption by [Hugo](https://gohugo.io/). + ### Example Usage (Binary) + Read Markdown from the `/content` folder and place the resulting `linkIndex.yaml` (and `contentIndex.yaml` if the `index` flag is enabled) into `/data` + + ```shell + # Installation + go install github.com/jackyzha0/hugo-obsidian + + # Run + hugo-obsidian -input=content -output=data -index=true + ``` + + ### Example Usage (GitHub Action) + + Add 'Build Link Index' as a build step in your workflow file (e.g. `.github/workflows/deploy.yaml`) + ```yaml + ... + + jobs: + deploy: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + - name: Build Link Index + uses: jackyzha0/hugo-obsidian@v2.1 + with: + input: content # input folder + output: data # output folder + index: true # whether to index content + ... + ``` diff --git a/go.mod b/go.mod index d3ec720..81f6c78 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/jackyzha0/hugo-obsidian go 1.16 require ( + github.com/BurntSushi/toml v0.4.1 // indirect github.com/PuerkitoBio/goquery v1.8.0 github.com/abhinav/goldmark-wikilink v0.3.0 github.com/gernest/front v0.0.0-20210301115436-8a0b0a782d0a diff --git a/go.sum b/go.sum index 2377062..c04e032 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/BurntSushi/toml v0.4.1 h1:GaI7EiDXDRfa8VshkTj7Fym7ha+y8/XxIgD2okUIjLw= +github.com/BurntSushi/toml v0.4.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= github.com/abhinav/goldmark-wikilink v0.3.0 h1:ry8CBaULn410PKCSkwLz/WVI2f/g7EB+yqY7LKHDcPQ= diff --git a/linkIndex.yaml b/linkIndex.yaml new file mode 100644 index 0000000..5fc0945 --- /dev/null +++ b/linkIndex.yaml @@ -0,0 +1,6 @@ +# THIS FILE WAS GENERATED USING github.com/jackyzha0/hugo-obsidian +# DO NOT EDIT +index: + links: {} + backlinks: {} +links: [] diff --git a/main.go b/main.go index 86571a5..f74f9e7 100644 --- a/main.go +++ b/main.go @@ -1,24 +1,13 @@ package main import ( - "bytes" "flag" - "fmt" - "github.com/gernest/front" - "gopkg.in/yaml.v3" - "io/fs" - "io/ioutil" - "path" - "path/filepath" - "strings" - - "github.com/PuerkitoBio/goquery" wikilink "github.com/abhinav/goldmark-wikilink" "github.com/yuin/goldmark" ) - var md goldmark.Markdown + func init() { md = goldmark.New( goldmark.WithExtensions(&wikilink.Extender{}), @@ -33,216 +22,23 @@ type Link struct { type LinkTable = map[string][]Link type Index struct { - Links LinkTable + Links LinkTable Backlinks LinkTable } type Content struct { - Title string + Title string Content string } type ContentIndex = map[string]Content -func trim(source, prefix, suffix string) string { - return strings.TrimPrefix(strings.TrimSuffix(source, suffix), prefix) +type IgnoredFiles struct { + } -func hugoPathTrim(source string) string { - return strings.TrimSuffix(strings.TrimSuffix(source, "/index"), "_index") -} +func getIgnoredFiles() { -func processTarget(source string) string { - if !isInternal(source) { - return source - } - if strings.HasPrefix(source, "/") { - return strings.TrimSuffix(source, ".md") - } - return "/" + strings.TrimSuffix(strings.TrimSuffix(source, ".html"), ".md") -} - -func isInternal(link string) bool { - return !strings.HasPrefix(link, "http") -} - -// parse single file for links -func parse(dir, pathPrefix string) []Link { - // read file - source, err := ioutil.ReadFile(dir) - if err != nil { - panic(err) - } - - // parse md - var links []Link - fmt.Printf("[Parsing note] %s\n", trim(dir, pathPrefix, ".md")) - - var buf bytes.Buffer - if err := md.Convert(source, &buf); err != nil { - panic(err) - } - - doc, err := goquery.NewDocumentFromReader(&buf) - var n int - doc.Find("a").Each(func(i int, s *goquery.Selection) { - text := strings.TrimSpace(s.Text()) - target, ok := s.Attr("href") - if !ok { - target = "#" - } - - target = strings.Replace(target, "%20", " ", -1) - target = strings.Split(processTarget(target), "#")[0] - target = strings.TrimSpace(target) - target = strings.Replace(target, " ", "-", -1) - - fmt.Printf(" '%s' => %s\n", text, target) - links = append(links, Link{ - Source: filepath.ToSlash(hugoPathTrim(trim(dir, pathPrefix, ".md"))), - Target: target, - Text: text, - }) - n++ - }) - fmt.Printf(":: %d links\n", n) - - return links -} - -func getText(dir string) string { - // read file - bytes, err := ioutil.ReadFile(dir) - if err != nil { - panic(err) - } - - return string(bytes) -} - -// recursively walk directory and return all files with given extension -func walk(root, ext string, index bool) (res []Link, i ContentIndex) { - println(root) - i = make(ContentIndex) - - m := front.NewMatter() - m.Handle("---", front.YAMLHandler) - nPrivate := 0 - - err := filepath.WalkDir(root, func(s string, d fs.DirEntry, e error) error { - if e != nil { - return e - } - if filepath.Ext(d.Name()) == ext { - res = append(res, parse(s, root)...) - if index { - text := getText(s) - - frontmatter, body, err := m.Parse(strings.NewReader(text)) - if err != nil { - frontmatter = map[string]interface{}{} - body = text - } - - var title string - if parsedTitle, ok := frontmatter["title"]; ok { - title = parsedTitle.(string) - } else { - title = "Untitled Page" - } - - // check if page is private - if parsedPrivate, ok := frontmatter["draft"]; !ok || !parsedPrivate.(bool) { - adjustedPath := strings.Replace(hugoPathTrim(trim(s, root, ".md")), " ", "-", -1) - i[adjustedPath] = Content{ - Title: title, - Content: body, - } - } else { - nPrivate++ - } - } - } - return nil - }) - if err != nil { - panic(err) - } - fmt.Printf("Ignored %d private files \n", nPrivate) - fmt.Printf("Parsed %d total links \n", len(res)) - return res, i -} - -// filter out certain links (e.g. to media) -func filter(links []Link) (res []Link) { - for _, l := range links { - // filter external and non-md - isMarkdown := filepath.Ext(l.Target) == "" || filepath.Ext(l.Target) == ".md" - if isInternal(l.Target) && isMarkdown { - res = append(res, l) - } - } - fmt.Printf("Removed %d external and non-markdown links\n", len(links) - len(res)) - return res -} - -// constructs index from links -func index(links []Link) (index Index) { - linkMap := make(map[string][]Link) - backlinkMap := make(map[string][]Link) - for _, l := range links { - // backlink (only if internal) - if _, ok := backlinkMap[l.Target]; ok { - backlinkMap[l.Target] = append(backlinkMap[l.Target], l) - } else { - backlinkMap[l.Target] = []Link{l} - } - - // regular link - if _, ok := linkMap[l.Source]; ok { - linkMap[l.Source] = append(linkMap[l.Source], l) - } else { - linkMap[l.Source] = []Link{l} - } - } - index.Links = linkMap - index.Backlinks = backlinkMap - return index -} - -const message = "# THIS FILE WAS GENERATED USING github.com/jackyzha0/hugo-obsidian\n# DO NOT EDIT\n" -func write(links []Link, contentIndex ContentIndex, toIndex bool, out string) error { - index := index(links) - resStruct := struct{ - Index Index - Links []Link - }{ - Index: index, - Links: links, - } - marshalledIndex, mErr := yaml.Marshal(&resStruct) - if mErr != nil { - return mErr - } - - writeErr := ioutil.WriteFile(path.Join(out, "linkIndex.yaml"), append([]byte(message), marshalledIndex...), 0644) - if writeErr != nil { - return writeErr - } - - if toIndex { - marshalledContentIndex, mcErr := yaml.Marshal(&contentIndex) - if mcErr != nil { - return mcErr - } - - writeErr = ioutil.WriteFile(path.Join(out, "contentIndex.yaml"), append([]byte(message), marshalledContentIndex...), 0644) - if writeErr != nil { - return writeErr - } - } - - return nil } func main() { diff --git a/parse.go b/parse.go new file mode 100644 index 0000000..0583c6b --- /dev/null +++ b/parse.go @@ -0,0 +1,54 @@ +package main + +import ( + "bytes" + "fmt" + "github.com/PuerkitoBio/goquery" + "io/ioutil" + "path/filepath" + "strings" +) + +// parse single file for links +func parse(dir, pathPrefix string) []Link { + // read file + source, err := ioutil.ReadFile(dir) + if err != nil { + panic(err) + } + + // parse md + var links []Link + fmt.Printf("[Parsing note] %s\n", trim(dir, pathPrefix, ".md")) + + var buf bytes.Buffer + if err := md.Convert(source, &buf); err != nil { + panic(err) + } + + doc, err := goquery.NewDocumentFromReader(&buf) + var n int + doc.Find("a").Each(func(i int, s *goquery.Selection) { + text := strings.TrimSpace(s.Text()) + target, ok := s.Attr("href") + if !ok { + target = "#" + } + + target = strings.Replace(target, "%20", " ", -1) + target = strings.Split(processTarget(target), "#")[0] + target = strings.TrimSpace(target) + target = strings.Replace(target, " ", "-", -1) + + fmt.Printf(" '%s' => %s\n", text, target) + links = append(links, Link{ + Source: filepath.ToSlash(hugoPathTrim(trim(dir, pathPrefix, ".md"))), + Target: target, + Text: text, + }) + n++ + }) + fmt.Printf(" Found: %d links\n", n) + + return links +} diff --git a/util.go b/util.go new file mode 100644 index 0000000..ab93bd6 --- /dev/null +++ b/util.go @@ -0,0 +1,43 @@ +package main + +import ( + "fmt" + "path/filepath" + "strings" +) + +func trim(source, prefix, suffix string) string { + return strings.TrimPrefix(strings.TrimSuffix(source, suffix), prefix) +} + +func hugoPathTrim(source string) string { + return strings.TrimSuffix(strings.TrimSuffix(source, "/index"), "_index") +} + +func processTarget(source string) string { + if !isInternal(source) { + return source + } + if strings.HasPrefix(source, "/") { + return strings.TrimSuffix(source, ".md") + } + return "/" + strings.TrimSuffix(strings.TrimSuffix(source, ".html"), ".md") +} + +func isInternal(link string) bool { + return !strings.HasPrefix(link, "http") +} + +// filter out certain links (e.g. to media) +func filter(links []Link) (res []Link) { + for _, l := range links { + // filter external and non-md + isMarkdown := filepath.Ext(l.Target) == "" || filepath.Ext(l.Target) == ".md" + if isInternal(l.Target) && isMarkdown { + res = append(res, l) + } + } + fmt.Printf("Removed %d external and non-markdown links\n", len(links)-len(res)) + return res +} + diff --git a/walk.go b/walk.go new file mode 100644 index 0000000..96cb96f --- /dev/null +++ b/walk.go @@ -0,0 +1,74 @@ +package main + +import ( + "fmt" + "github.com/gernest/front" + "io/fs" + "io/ioutil" + "path/filepath" + "strings" +) + +// recursively walk directory and return all files with given extension +func walk(root, ext string, index bool) (res []Link, i ContentIndex) { + fmt.Printf("Scraping %s\n", root) + i = make(ContentIndex) + + m := front.NewMatter() + m.Handle("---", front.YAMLHandler) + nPrivate := 0 + + err := filepath.WalkDir(root, func(s string, d fs.DirEntry, e error) error { + if e != nil { + return e + } + if filepath.Ext(d.Name()) == ext { + res = append(res, parse(s, root)...) + if index { + text := getText(s) + + frontmatter, body, err := m.Parse(strings.NewReader(text)) + if err != nil { + frontmatter = map[string]interface{}{} + body = text + } + + var title string + if parsedTitle, ok := frontmatter["title"]; ok { + title = parsedTitle.(string) + } else { + title = "Untitled Page" + } + + // check if page is private + if parsedPrivate, ok := frontmatter["draft"]; !ok || !parsedPrivate.(bool) { + adjustedPath := strings.Replace(hugoPathTrim(trim(s, root, ".md")), " ", "-", -1) + i[adjustedPath] = Content{ + Title: title, + Content: body, + } + } else { + nPrivate++ + } + } + } + return nil + }) + if err != nil { + panic(err) + } + fmt.Printf("Ignored %d private files \n", nPrivate) + fmt.Printf("Parsed %d total links \n", len(res)) + return res, i +} + +func getText(dir string) string { + // read file + fileBytes, err := ioutil.ReadFile(dir) + if err != nil { + panic(err) + } + + return string(fileBytes) +} + diff --git a/write.go b/write.go new file mode 100644 index 0000000..6f166b3 --- /dev/null +++ b/write.go @@ -0,0 +1,69 @@ +package main + +import ( + "gopkg.in/yaml.v3" + "io/ioutil" + "path" +) + +const message = "# THIS FILE WAS GENERATED USING github.com/jackyzha0/hugo-obsidian\n# DO NOT EDIT\n" +func write(links []Link, contentIndex ContentIndex, toIndex bool, out string) error { + index := index(links) + resStruct := struct { + Index Index + Links []Link + }{ + Index: index, + Links: links, + } + marshalledIndex, mErr := yaml.Marshal(&resStruct) + if mErr != nil { + return mErr + } + + writeErr := ioutil.WriteFile(path.Join(out, "linkIndex.yaml"), append([]byte(message), marshalledIndex...), 0644) + if writeErr != nil { + return writeErr + } + + if toIndex { + marshalledContentIndex, mcErr := yaml.Marshal(&contentIndex) + if mcErr != nil { + return mcErr + } + + writeErr = ioutil.WriteFile(path.Join(out, "contentIndex.yaml"), append([]byte(message), marshalledContentIndex...), 0644) + if writeErr != nil { + return writeErr + } + } + + return nil +} + +// constructs index from links +func index(links []Link) (index Index) { + linkMap := make(map[string][]Link) + backlinkMap := make(map[string][]Link) + for _, l := range links { + // backlink (only if internal) + if _, ok := backlinkMap[l.Target]; ok { + backlinkMap[l.Target] = append(backlinkMap[l.Target], l) + } else { + backlinkMap[l.Target] = []Link{l} + } + + // regular link + if _, ok := linkMap[l.Source]; ok { + linkMap[l.Source] = append(linkMap[l.Source], l) + } else { + linkMap[l.Source] = []Link{l} + } + } + index.Links = linkMap + index.Backlinks = backlinkMap + return index +} + + +