Skip to content

Commit

Permalink
Merge pull request #174 from NetSepio/vaibhavvvvv/dev
Browse files Browse the repository at this point in the history
summary: scrape URLs from the web to find terms-of-use and privacy-policy pages and generate summaries of them
  • Loading branch information
vaibhavvvvv authored May 30, 2024
2 parents c845c60 + ec24d08 commit dc430d1
Show file tree
Hide file tree
Showing 4 changed files with 462 additions and 87 deletions.
44 changes: 44 additions & 0 deletions api/v1/summary/create_summary.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package summary

import (
"context"
"fmt"
"strings"

"github.com/NetSepio/gateway/config/envconfig"
openai "github.com/sashabaranov/go-openai"
)

// summarizeContent concatenates the given content strings and asks the
// OpenAI completion API for a key-point summary under 150 words.
// It returns "" when there is nothing to summarize, and a fixed fallback
// message when the API call fails or returns no choices.
func summarizeContent(contents []string) string {
	if len(contents) == 0 {
		return ""
	}

	var builder strings.Builder
	for _, content := range contents {
		builder.WriteString(content)
		builder.WriteString("\n")
	}

	// Cap the prompt size so it stays within the model's context budget.
	// NOTE(review): byte slicing may split a multi-byte UTF-8 rune at the
	// cut point — harmless for a summarization prompt, but worth knowing.
	const maxPromptBytes = 128000
	prompt := builder.String()
	if len(prompt) > maxPromptBytes {
		prompt = prompt[:maxPromptBytes]
	}

	openAIKey := envconfig.EnvVars.OPENAI_API_KEY
	client := openai.NewClient(openAIKey)

	req := openai.CompletionRequest{
		Model:     "gpt-4-turbo",
		Prompt:    "Summarize the following in key points under 150 words:\n\n" + prompt,
		MaxTokens: 150,
	}

	resp, err := client.CreateCompletion(context.Background(), req)
	if err != nil {
		fmt.Println("error:", err)
		return "Failed to summarize content"
	}
	// Guard the Choices slice: indexing an empty response would panic.
	if len(resp.Choices) == 0 {
		return "Failed to summarize content"
	}

	return resp.Choices[0].Text
}
155 changes: 155 additions & 0 deletions api/v1/summary/scrape_urls.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package summary

import (
"context"
"fmt"
"strings"
"time"

"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/chromedp"
)

func extractLinksFromURL(url string) ([]string, error) {
opts := append(

Check failure on line 14 in api/v1/summary/scrape_urls.go

View workflow job for this annotation

GitHub Actions / Run go vet and staticcheck

x = append(y) is equivalent to x = y (SA4021)
chromedp.DefaultExecAllocatorOptions[:],

// chromedp.DefaultExecAllocatorOptions[3:],
// chromedp.NoFirstRun,
// chromedp.NoDefaultBrowserCheck,
)

parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
defer cancel()
ctx, cancel := chromedp.NewContext(parentCtx)
defer cancel()
var links []string

if err := chromedp.Run(ctx, navigateToWebsite(url)); err != nil {
return nil, fmt.Errorf("failed to navigate to website: %v", err)
}

if err := chromedp.Run(ctx, chromedp.Sleep(3*time.Second)); err != nil {
return nil, fmt.Errorf("failed to wait: %v", err)
}

if err := chromedp.Run(ctx, extractLinks(&links)); err != nil {
return nil, fmt.Errorf("failed to extract links: %v", err)
}

return links, nil
}

// navigateToWebsite returns a chromedp action that loads the given URL
// in the browser tab bound to the action's context.
func navigateToWebsite(url string) chromedp.Action {
return chromedp.Navigate(url)
}

// extractLinks returns a chromedp action that appends the non-empty href
// attribute of every anchor element on the current page to links.
func extractLinks(links *[]string) chromedp.Action {
	return chromedp.ActionFunc(func(ctx context.Context) error {
		var nodes []*cdp.Node
		// ByQueryAll (querySelectorAll) matches every <a> on the page;
		// the default ByQuery (querySelector) would return only the
		// first anchor, silently dropping the rest of the links.
		if err := chromedp.Run(ctx, chromedp.Nodes("a", &nodes, chromedp.ByQueryAll)); err != nil {
			return err
		}

		for _, node := range nodes {
			if href := node.AttributeValue("href"); href != "" {
				*links = append(*links, href)
			}
		}

		return nil
	})
}

func extractContentFromLink(link string) (string, error) {
opts := append(

Check failure on line 66 in api/v1/summary/scrape_urls.go

View workflow job for this annotation

GitHub Actions / Run go vet and staticcheck

x = append(y) is equivalent to x = y (SA4021)
chromedp.DefaultExecAllocatorOptions[:],

// chromedp.DefaultExecAllocatorOptions[3:],
// chromedp.NoFirstRun,
// chromedp.NoDefaultBrowserCheck,
)

parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
defer cancel()
ctx, cancel := chromedp.NewContext(parentCtx)
defer cancel()

var content string

if err := chromedp.Run(ctx, navigateToWebsite(link)); err != nil {
return "", fmt.Errorf("failed to navigate to link: %v", err)
}

if err := chromedp.Run(ctx, chromedp.Sleep(3*time.Second)); err != nil {
return "", fmt.Errorf("failed to wait: %v", err)
}

if err := chromedp.Run(ctx, extractText(&content)); err != nil {
return "", fmt.Errorf("failed to extract text content: %v", err)
}
fmt.Println(content)

return content, nil
}

// extractText returns a chromedp action that copies the visible text of
// the page's <body> element into content.
func extractText(content *string) chromedp.Action {
	return chromedp.ActionFunc(func(ctx context.Context) error {
		var bodyText string
		err := chromedp.Run(ctx, chromedp.Text("body", &bodyText, chromedp.NodeVisible, chromedp.ByQuery))
		if err != nil {
			return err
		}
		*content = bodyText
		return nil
	})
}

// containsTermsKeywords reports whether link looks like a terms-of-service
// page, using a case-insensitive substring match against known keywords.
func containsTermsKeywords(link string) bool {
	lowered := strings.ToLower(link)
	keywords := []string{"terms", "tos", "terms-of-service", "eula", "end-user-license-agreement", "termsofuse"}
	for _, kw := range keywords {
		if strings.Contains(lowered, kw) {
			return true
		}
	}
	return false
}

// containsPrivacyKeywords reports whether link looks like a privacy-policy
// page, using a case-insensitive substring match against known keywords.
func containsPrivacyKeywords(link string) bool {
	lowered := strings.ToLower(link)
	keywords := []string{"privacy", "privacy-policy", "policy", "policies"}
	for _, kw := range keywords {
		if strings.Contains(lowered, kw) {
			return true
		}
	}
	return false
}

// summarizeLinksContent scrapes every link that looks like a terms-of-service
// or privacy-policy page and returns an AI-generated summary for each
// category's combined text. Links that fail to load are skipped best-effort.
func summarizeLinksContent(links []string) (termsSummary, privacySummary string) {
	var termsContents, privacyContents []string

	for _, link := range links {
		isTerms := containsTermsKeywords(link)
		isPrivacy := containsPrivacyKeywords(link)
		if !isTerms && !isPrivacy {
			continue
		}

		// Scrape the page once even when the link matches both keyword
		// sets; the original code launched two full browser sessions
		// for such links and also dumped debug output to stdout.
		content, err := extractContentFromLink(link)
		if err != nil {
			continue // best-effort: skip links that fail to load
		}
		if isTerms {
			termsContents = append(termsContents, content)
		}
		if isPrivacy {
			privacyContents = append(privacyContents, content)
		}
	}

	termsSummary = summarizeContent(termsContents)
	privacySummary = summarizeContent(privacyContents)

	return termsSummary, privacySummary
}
Loading

0 comments on commit dc430d1

Please sign in to comment.