Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/word #62

Merged
merged 3 commits into from
Dec 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ toolchain go1.23.3
require (
github.com/PuerkitoBio/goquery v1.10.0
github.com/basenana/go-flow v0.0.0-20230801131009-d05f1f41b706
github.com/blevesearch/bleve/v2 v2.3.10
github.com/blevesearch/bleve/v2 v2.4.0
github.com/blevesearch/upsidedown_store_api v1.0.2
github.com/cdipaolo/goml v0.0.0-20220715001353-00e0c845ae1c
github.com/gin-gonic/gin v1.10.0
github.com/go-gormigrate/gormigrate/v2 v2.1.1
github.com/google/uuid v1.3.0
github.com/hyponet/jiebago v0.0.0-20240525141904-e34990856482
github.com/meilisearch/meilisearch-go v0.29.0
github.com/onsi/ginkgo v1.16.5
github.com/onsi/ginkgo/v2 v2.13.0
Expand All @@ -22,20 +23,21 @@ require (
go.uber.org/zap v1.27.0
golang.org/x/time v0.4.0
gorm.io/driver/postgres v1.5.2
gorm.io/gorm v1.25.4
gorm.io/gorm v1.25.7
)

require (
github.com/RoaringBitmap/roaring v1.2.3 // indirect
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/bits-and-blooms/bitset v1.2.0 // indirect
github.com/blevesearch/bleve_index_api v1.0.6 // indirect
github.com/blevesearch/geo v0.1.18 // indirect
github.com/blevesearch/bleve_index_api v1.1.6 // indirect
github.com/blevesearch/geo v0.1.20 // indirect
github.com/blevesearch/go-faiss v1.0.13 // indirect
github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
github.com/blevesearch/gtreap v0.1.1 // indirect
github.com/blevesearch/mmap-go v1.0.4 // indirect
github.com/blevesearch/scorch_segment_api/v2 v2.1.6 // indirect
github.com/blevesearch/scorch_segment_api/v2 v2.2.9 // indirect
github.com/blevesearch/segment v0.9.1 // indirect
github.com/blevesearch/snowballstem v0.9.0 // indirect
github.com/blevesearch/vellum v1.0.10 // indirect
Expand All @@ -44,13 +46,17 @@ require (
github.com/blevesearch/zapx/v13 v13.3.10 // indirect
github.com/blevesearch/zapx/v14 v14.3.10 // indirect
github.com/blevesearch/zapx/v15 v15.3.13 // indirect
github.com/blevesearch/zapx/v16 v16.0.12 // indirect
github.com/bytedance/sonic v1.12.5 // indirect
github.com/bytedance/sonic/loader v0.2.1 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/fsnotify/fsnotify v1.4.9 // indirect
github.com/gabriel-vasile/mimetype v1.4.7 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/glebarez/go-sqlite v1.21.2 // indirect
github.com/glebarez/sqlite v1.11.0 // indirect
github.com/go-logr/logr v1.2.4 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
Expand Down Expand Up @@ -81,6 +87,7 @@ require (
github.com/mschoch/smat v0.2.0 // indirect
github.com/nxadm/tail v1.4.8 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/rogpeppe/go-internal v1.11.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
Expand All @@ -97,4 +104,8 @@ require (
google.golang.org/protobuf v1.35.2 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
modernc.org/libc v1.22.5 // indirect
modernc.org/mathutil v1.5.0 // indirect
modernc.org/memory v1.5.0 // indirect
modernc.org/sqlite v1.23.1 // indirect
)
41 changes: 33 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,22 @@ github.com/basenana/go-flow v0.0.0-20230801131009-d05f1f41b706 h1:FxXoMwMZsufBjS
github.com/basenana/go-flow v0.0.0-20230801131009-d05f1f41b706/go.mod h1:Rs13PWsg/ITdXRiVJcI+yS0iqCfNHxCbIFEt5DCt/RQ=
github.com/bits-and-blooms/bitset v1.2.0 h1:Kn4yilvwNtMACtf1eYDlG8H77R07mZSPbMjLyS07ChA=
github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
github.com/blevesearch/bleve/v2 v2.3.10 h1:z8V0wwGoL4rp7nG/O3qVVLYxUqCbEwskMt4iRJsPLgg=
github.com/blevesearch/bleve/v2 v2.3.10/go.mod h1:RJzeoeHC+vNHsoLR54+crS1HmOWpnH87fL70HAUCzIA=
github.com/blevesearch/bleve_index_api v1.0.6 h1:gyUUxdsrvmW3jVhhYdCVL6h9dCjNT/geNU7PxGn37p8=
github.com/blevesearch/bleve_index_api v1.0.6/go.mod h1:YXMDwaXFFXwncRS8UobWs7nvo0DmusriM1nztTlj1ms=
github.com/blevesearch/geo v0.1.18 h1:Np8jycHTZ5scFe7VEPLrDoHnnb9C4j636ue/CGrhtDw=
github.com/blevesearch/geo v0.1.18/go.mod h1:uRMGWG0HJYfWfFJpK3zTdnnr1K+ksZTuWKhXeSokfnM=
github.com/blevesearch/bleve/v2 v2.4.0 h1:2xyg+Wv60CFHYccXc+moGxbL+8QKT/dZK09AewHgKsg=
github.com/blevesearch/bleve/v2 v2.4.0/go.mod h1:IhQHoFAbHgWKYavb9rQgQEJJVMuY99cKdQ0wPpst2aY=
github.com/blevesearch/bleve_index_api v1.1.6 h1:orkqDFCBuNU2oHW9hN2YEJmet+TE9orml3FCGbl1cKk=
github.com/blevesearch/bleve_index_api v1.1.6/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
github.com/blevesearch/go-faiss v1.0.13 h1:zfFs7ZYD0NqXVSY37j0JZjZT1BhE9AE4peJfcx/NB4A=
github.com/blevesearch/go-faiss v1.0.13/go.mod h1:jrxHrbl42X/RnDPI+wBoZU8joxxuRwedrxqswQ3xfU8=
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M=
github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y=
github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk=
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
github.com/blevesearch/scorch_segment_api/v2 v2.1.6 h1:CdekX/Ob6YCYmeHzD72cKpwzBjvkOGegHOqhAkXp6yA=
github.com/blevesearch/scorch_segment_api/v2 v2.1.6/go.mod h1:nQQYlp51XvoSVxcciBjtvuHPIVjlWrN1hX4qwK2cqdc=
github.com/blevesearch/scorch_segment_api/v2 v2.2.9 h1:3nBaSBRFokjE4FtPW3eUDgcAu3KphBg1GP07zy/6Uyk=
github.com/blevesearch/scorch_segment_api/v2 v2.2.9/go.mod h1:ckbeb7knyOOvAdZinn/ASbB7EA3HoagnJkmEV3J7+sg=
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
Expand All @@ -42,6 +44,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7
github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ=
github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg=
github.com/blevesearch/zapx/v16 v16.0.12 h1:Uccxvjmn+hQ6ywQP+wIiTpdq9LnAviGoryJOmGwAo/I=
github.com/blevesearch/zapx/v16 v16.0.12/go.mod h1:MYnOshRfSm4C4drxx1LGRI+MVFByykJ2anDY1fxdk9Q=
github.com/bytedance/sonic v1.12.5 h1:hoZxY8uW+mT+OpkcUWw4k0fDINtOcVavEsGfzwzFU/w=
github.com/bytedance/sonic v1.12.5/go.mod h1:B8Gt/XvtZ3Fqj+iSKMypzymZxw/FVwgIGKzMzT9r/rk=
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
Expand All @@ -57,6 +61,8 @@ github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46t
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
Expand All @@ -66,6 +72,10 @@ github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
github.com/glebarez/go-sqlite v1.21.2 h1:3a6LFC4sKahUunAmynQKLZceZCOzUthkRkEAl9gAXWo=
github.com/glebarez/go-sqlite v1.21.2/go.mod h1:sfxdZyhQjTM2Wry3gVYWaW072Ri1WMdWJi0k6+3382k=
github.com/glebarez/sqlite v1.11.0 h1:wSG0irqzP6VurnMEpFGer5Li19RpIRi2qvQz++w0GMw=
github.com/glebarez/sqlite v1.11.0/go.mod h1:h8/o8j5wiAsqSPoWELDUdJXhjAhsVliSn7bWZjOhrgQ=
github.com/go-gormigrate/gormigrate/v2 v2.1.1 h1:eGS0WTFRV30r103lU8JNXY27KbviRnqqIDobW3EV3iY=
github.com/go-gormigrate/gormigrate/v2 v2.1.1/go.mod h1:L7nJ620PFDKei9QOhJzqA8kRCk+E3UbV2f5gv+1ndLc=
github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
Expand Down Expand Up @@ -111,6 +121,8 @@ github.com/google/pprof v0.0.0-20230207041349-798e818bf904/go.mod h1:uglQLonpP8q
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/hyponet/jiebago v0.0.0-20240525141904-e34990856482 h1:6phC4CbKBzBQ7PHsf3I21WLF9Fxy8feaLhXmCgh/lcs=
github.com/hyponet/jiebago v0.0.0-20240525141904-e34990856482/go.mod h1:/QDKR3hYNk0E+DX4XWJHuBsiC1UTGdGynWjps4Kj+T0=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
Expand Down Expand Up @@ -171,6 +183,9 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/redis/rueidis v1.0.0 h1:LrUhkD46Es7neMvpTgqyYGRpvlGG4F6dLIRq+nUw/ho=
github.com/redis/rueidis v1.0.0/go.mod h1:yxbpgX+VYNxCvdE0KEQXDeUFcF2hB2Oz/TJiaqFxoEU=
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
Expand Down Expand Up @@ -302,4 +317,14 @@ gorm.io/driver/postgres v1.5.2 h1:ytTDxxEv+MplXOfFe3Lzm7SjG09fcdb3Z/c056DTBx0=
gorm.io/driver/postgres v1.5.2/go.mod h1:fmpX0m2I1PKuR7mKZiEluwrP3hbs+ps7JIGMUBpCgl8=
gorm.io/gorm v1.25.4 h1:iyNd8fNAe8W9dvtlgeRI5zSVZPsq3OpcTu37cYcpCmw=
gorm.io/gorm v1.25.4/go.mod h1:L4uxeKpfBml98NYqVqwAdmV1a2nBtAec/cf3fpucW/k=
gorm.io/gorm v1.25.7 h1:VsD6acwRjz2zFxGO50gPO6AkNs7KKnvfzUjHQhZDz/A=
gorm.io/gorm v1.25.7/go.mod h1:hbnx/Oo0ChWMn1BIhpy1oYozzpM15i4YPuHDmfYtwg8=
modernc.org/libc v1.22.5 h1:91BNch/e5B0uPbJFgqbxXuOnxBQjlS//icfQEGmvyjE=
modernc.org/libc v1.22.5/go.mod h1:jj+Z7dTNX8fBScMVNRAYZ/jF91K8fdT2hYMThc3YjBY=
modernc.org/mathutil v1.5.0 h1:rV0Ko/6SfM+8G+yKiyI830l3Wuz1zRutdslNoQ0kfiQ=
modernc.org/mathutil v1.5.0/go.mod h1:mZW8CKdRPY1v87qxC/wUdX5O1qDzXMP5TH3wjfpga6E=
modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds=
modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU=
modernc.org/sqlite v1.23.1 h1:nrSBg4aRQQwq59JpvGEQ15tNxoO5pX/kUjcRNwSAGQM=
modernc.org/sqlite v1.23.1/go.mod h1:OrDj17Mggn6MhE+iPbBNf7RGKODDE9NFT0f3EwDzJqk=
nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50=
95 changes: 95 additions & 0 deletions pkg/dispatch/plugin/doc_process.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
Copyright 2024 Friday Author.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugin

import (
"bytes"
"context"
"fmt"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/hyponet/jiebago"

"github.com/basenana/friday/pkg/models/doc"
)

type DocProcessPlugin struct {
seg jiebago.Segmenter
}

func NewDocProcessPlugin() *DocProcessPlugin {
seg := jiebago.Segmenter{}
seg.LoadDictionary("dict.txt")
return &DocProcessPlugin{
seg: seg,
}
}

var _ ChainPlugin = &DocProcessPlugin{}

func (s *DocProcessPlugin) Name() string {
return "docProcess"
}

func (s *DocProcessPlugin) Run(ctx context.Context, doc *doc.Document) error {
var err error
// html analysis
doc.PureContent, err = trimContent(doc.Content)
if err != nil {
return fmt.Errorf("process doc with id %d error: %s", doc.EntryId, err)
}

// split title
doc.TitleTokens = s.splitTokens(doc.Name)

// split content
doc.ContentTokens = s.splitTokens(doc.PureContent)
return nil
}

func trimContent(content string) (string, error) {
query, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(content)))
if err != nil {
return "", err
}

query.Find("body").EachWithBreak(func(i int, selection *goquery.Selection) bool {
t := strings.TrimSpace(selection.Text())
if t != "" {
content = t
}
return true
})

content = ContentTrim("html", content)
content = strings.ReplaceAll(content, "'", "")
return content, nil
}

func (s *DocProcessPlugin) splitTokens(content string) []string {
contentCh := s.seg.CutForSearch(content, true)
tokens := make([]string, 0, len(contentCh))
for token := range contentCh {
tokens = append(tokens, token)
}
return tokens
}

func init() {
RegisterPlugin(NewDocProcessPlugin())
}
2 changes: 1 addition & 1 deletion pkg/dispatch/plugin/header.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func (h *HeaderImgPlugin) Run(ctx context.Context, doc *doc.Document) error {
var headerImgUrl string
query, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(doc.Content)))
if err != nil {
return fmt.Errorf("build doc query with id %d error: %s", doc.EntryId, err)
return fmt.Errorf("get header image of doc with id %d error: %s", doc.EntryId, err)
}

query.Find("img").EachWithBreak(func(i int, selection *goquery.Selection) bool {
Expand Down
17 changes: 0 additions & 17 deletions pkg/dispatch/plugin/subcontent.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package plugin
import (
"bytes"
"context"
"regexp"
"strings"

"github.com/PuerkitoBio/goquery"
Expand All @@ -41,22 +40,6 @@ func (s *SubContentPlugin) Run(ctx context.Context, doc *doc.Document) error {

var _ ChainPlugin = &SubContentPlugin{}

var repeatSpace = regexp.MustCompile(`\s+`)
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w:]+((\s+[\w-]+(\s*=\s*(?:\\*".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)

func ContentTrim(contentType, content string) string {
switch contentType {
case "html", "htm", "webarchive", ".webarchive":
content = strings.ReplaceAll(content, "</p>", "</p>\n")
content = strings.ReplaceAll(content, "</P>", "</P>\n")
content = strings.ReplaceAll(content, "</div>", "</div>\n")
content = strings.ReplaceAll(content, "</DIV>", "</DIV>\n")
content = htmlCharFilterRegexp.ReplaceAllString(content, "")
}
content = repeatSpace.ReplaceAllString(content, " ")
return content
}

func GenerateContentSubContent(content string) string {
if subContent, err := slowPathContentSubContent([]byte(content)); err == nil {
return subContent
Expand Down
38 changes: 38 additions & 0 deletions pkg/dispatch/plugin/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
Copyright 2024 Friday Author.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugin

import (
"regexp"
"strings"
)

var repeatSpace = regexp.MustCompile(`\s+`)
var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w:]+((\s+[\w-]+(\s*=\s*(?:\\*".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)

func ContentTrim(contentType, content string) string {
switch contentType {
case "html", "htm", "webarchive", ".webarchive":
content = strings.ReplaceAll(content, "</p>", "</p>\n")
content = strings.ReplaceAll(content, "</P>", "</P>\n")
content = strings.ReplaceAll(content, "</div>", "</div>\n")
content = strings.ReplaceAll(content, "</DIV>", "</DIV>\n")
content = htmlCharFilterRegexp.ReplaceAllString(content, "")
}
content = repeatSpace.ReplaceAllString(content, " ")
return content
}
3 changes: 3 additions & 0 deletions pkg/models/doc/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ type Document struct {
WebUrl string `json:"web_url,omitempty"`
HeaderImage string `json:"header_image,omitempty"`
SubContent string `json:"sub_content,omitempty"`
PureContent string `json:"pure_content,omitempty"`
TitleTokens []string `json:"title_tokens,omitempty"`
ContentTokens []string `json:"content_tokens,omitempty"`
Marked *bool `json:"marked,omitempty"`
Unread *bool `json:"unread,omitempty"`
CreatedAt time.Time `json:"created_at"`
Expand Down
9 changes: 8 additions & 1 deletion pkg/service/chain.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,11 @@ func (c *Chain) CreateDocument(ctx context.Context, document *doc.Document) erro
}
}
c.Log.Debugf("create document: %+v", document.Name)
return c.DocClient.CreateDocument(ctx, document)
if err := c.DocClient.CreateDocument(ctx, document); err != nil {
c.Log.Errorf("create document error: %s", err)
return err
}
return c.DocClient.UpdateTokens(ctx, document)
})
}

Expand All @@ -110,6 +114,9 @@ func (c *Chain) GetDocument(ctx context.Context, namespace string, entryId int64
ctx = c.WithNamespace(ctx, namespace)
doc, err := c.DocClient.GetDocument(ctx, entryId)
if err != nil {
if err == models.ErrNotFound {
return nil, nil
}
c.Log.Errorf("get document error: %s", err)
return nil, err
}
Expand Down
Loading
Loading