-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #174 from NetSepio/vaibhavvvvv/dev
summary: scraping urls from web to find terms of use and privacy policy and generate its summary
- Loading branch information
Showing
4 changed files
with
462 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package summary | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"strings" | ||
|
||
"github.com/NetSepio/gateway/config/envconfig" | ||
openai "github.com/sashabaranov/go-openai" | ||
) | ||
|
||
func summarizeContent(contents []string) string { | ||
if len(contents) == 0 { | ||
return "" | ||
} | ||
var builder strings.Builder | ||
for _, content := range contents { | ||
builder.WriteString(content) | ||
builder.WriteString("\n") | ||
} | ||
|
||
prompt := builder.String() | ||
if len(prompt) > 128000 { | ||
prompt = prompt[:127999] | ||
} | ||
|
||
open_ai_key := envconfig.EnvVars.OPENAI_API_KEY | ||
|
||
client := openai.NewClient(open_ai_key) | ||
|
||
req := openai.CompletionRequest{ | ||
Model: "gpt-4-turbo", | ||
Prompt: "Summarize the following in key points under 150 words:\n\n" + prompt, | ||
MaxTokens: 150, | ||
} | ||
|
||
summary, err := client.CreateCompletion(context.Background(), req) | ||
if err != nil { | ||
fmt.Println("error:", err) | ||
return "Failed to summarize content" | ||
} | ||
|
||
return summary.Choices[0].Text | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
package summary | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"strings" | ||
"time" | ||
|
||
"github.com/chromedp/cdproto/cdp" | ||
"github.com/chromedp/chromedp" | ||
) | ||
|
||
func extractLinksFromURL(url string) ([]string, error) { | ||
opts := append( | ||
chromedp.DefaultExecAllocatorOptions[:], | ||
|
||
// chromedp.DefaultExecAllocatorOptions[3:], | ||
// chromedp.NoFirstRun, | ||
// chromedp.NoDefaultBrowserCheck, | ||
) | ||
|
||
parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) | ||
defer cancel() | ||
ctx, cancel := chromedp.NewContext(parentCtx) | ||
defer cancel() | ||
var links []string | ||
|
||
if err := chromedp.Run(ctx, navigateToWebsite(url)); err != nil { | ||
return nil, fmt.Errorf("failed to navigate to website: %v", err) | ||
} | ||
|
||
if err := chromedp.Run(ctx, chromedp.Sleep(3*time.Second)); err != nil { | ||
return nil, fmt.Errorf("failed to wait: %v", err) | ||
} | ||
|
||
if err := chromedp.Run(ctx, extractLinks(&links)); err != nil { | ||
return nil, fmt.Errorf("failed to extract links: %v", err) | ||
} | ||
|
||
return links, nil | ||
} | ||
|
||
func navigateToWebsite(url string) chromedp.Action { | ||
return chromedp.Navigate(url) | ||
} | ||
|
||
func extractLinks(links *[]string) chromedp.Action { | ||
return chromedp.ActionFunc(func(ctx context.Context) error { | ||
var nodes []*cdp.Node | ||
if err := chromedp.Run(ctx, chromedp.Nodes("a", &nodes)); err != nil { | ||
return err | ||
} | ||
|
||
for _, node := range nodes { | ||
href := node.AttributeValue("href") | ||
if href != "" { | ||
*links = append(*links, href) | ||
} | ||
} | ||
|
||
return nil | ||
}) | ||
} | ||
|
||
func extractContentFromLink(link string) (string, error) { | ||
opts := append( | ||
chromedp.DefaultExecAllocatorOptions[:], | ||
|
||
// chromedp.DefaultExecAllocatorOptions[3:], | ||
// chromedp.NoFirstRun, | ||
// chromedp.NoDefaultBrowserCheck, | ||
) | ||
|
||
parentCtx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) | ||
defer cancel() | ||
ctx, cancel := chromedp.NewContext(parentCtx) | ||
defer cancel() | ||
|
||
var content string | ||
|
||
if err := chromedp.Run(ctx, navigateToWebsite(link)); err != nil { | ||
return "", fmt.Errorf("failed to navigate to link: %v", err) | ||
} | ||
|
||
if err := chromedp.Run(ctx, chromedp.Sleep(3*time.Second)); err != nil { | ||
return "", fmt.Errorf("failed to wait: %v", err) | ||
} | ||
|
||
if err := chromedp.Run(ctx, extractText(&content)); err != nil { | ||
return "", fmt.Errorf("failed to extract text content: %v", err) | ||
} | ||
fmt.Println(content) | ||
|
||
return content, nil | ||
} | ||
|
||
func extractText(content *string) chromedp.Action { | ||
return chromedp.ActionFunc(func(ctx context.Context) error { | ||
var node string | ||
if err := chromedp.Run(ctx, chromedp.Text("body", &node, chromedp.NodeVisible, chromedp.ByQuery)); err != nil { | ||
return err | ||
} | ||
|
||
*content = node | ||
return nil | ||
}) | ||
} | ||
|
||
// containsTermsKeywords reports whether link looks like it points to a
// terms-of-service / EULA page. Matching is case-insensitive.
//
// The longer keywords are matched as substrings. The acronym "tos" is
// matched only as a whole token (delimited by any character other than a
// letter or digit), because as a bare substring it also matches unrelated
// links such as "/photos".
func containsTermsKeywords(link string) bool {
	link = strings.ToLower(link)

	termsKeywords := []string{"terms", "terms-of-service", "eula", "end-user-license-agreement", "termsofuse"}
	for _, keyword := range termsKeywords {
		if strings.Contains(link, keyword) {
			return true
		}
	}

	// Split on everything that is not a lowercase ASCII letter or digit
	// (the link is already lowercased) and look for a standalone "tos".
	tokens := strings.FieldsFunc(link, func(r rune) bool {
		return (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})
	for _, token := range tokens {
		if token == "tos" {
			return true
		}
	}

	return false
}
|
||
// containsPrivacyKeywords reports whether link, compared case-insensitively,
// contains any of the privacy/policy related keywords as a substring.
func containsPrivacyKeywords(link string) bool {
	lowered := strings.ToLower(link)
	needles := [...]string{"privacy", "privacy-policy", "policy", "policies"}

	for i := 0; i < len(needles); i++ {
		if strings.Contains(lowered, needles[i]) {
			return true
		}
	}
	return false
}
|
||
func summarizeLinksContent(links []string) (termsSummary, privacySummary string) { | ||
var termsContents, privacyContents []string | ||
|
||
for _, link := range links { | ||
if containsTermsKeywords(link) { | ||
content, err := extractContentFromLink(link) | ||
if err == nil { | ||
termsContents = append(termsContents, content) | ||
fmt.Println(termsContents) | ||
} | ||
} | ||
if containsPrivacyKeywords(link) { | ||
content, err := extractContentFromLink(link) | ||
if err == nil { | ||
privacyContents = append(privacyContents, content) | ||
fmt.Println(privacyContents) | ||
} | ||
} | ||
} | ||
|
||
termsSummary = summarizeContent(termsContents) | ||
privacySummary = summarizeContent(privacyContents) | ||
|
||
return termsSummary, privacySummary | ||
} |
Oops, something went wrong.