-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_collection.go
86 lines (72 loc) · 1.86 KB
/
data_collection.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package main
import (
"io/ioutil"
"os"
"path/filepath"
"strings"
"time"
)
// ContentType names the category a collected piece of text belongs to.
type ContentType string

// Known content categories. collectTextData assigns one of these to a file
// when the matching folder name ("journals", "blog_posts", "social_media",
// "references") appears in the path the file was found under.
const (
	// Journal marks a journal entry.
	Journal = ContentType("journal")
	// BlogPost marks a blog post.
	BlogPost = ContentType("blog_post")
	// SocialPost marks a social media post.
	SocialPost = ContentType("social_post")
	// Reference marks reference material.
	Reference = ContentType("reference")
)
// Content is one collected piece of text together with the metadata that was
// derived for it.
type Content struct {
	// Type is the category of the text; it is the empty string when no
	// category could be determined.
	Type ContentType
	// Text is the full body of the piece.
	Text string
	// Date is the date associated with the piece. collectTextData reads it
	// from the file name and leaves it as the zero time.Time when the name
	// carries no parsable date.
	Date time.Time
	// Tags holds the hashtags found in Text, without the leading "#".
	Tags []string
	// SourceURL is the origin link, intended for references or social media
	// posts. collectTextData does not populate it.
	SourceURL string
}
// collectTextData walks directory recursively and loads every non-directory
// entry whose name ends in ".txt" into a Content value.
//
// For each such file:
//   - Text is the complete file contents.
//   - Date comes from the file name (see dateFromFileName); it is the zero
//     time.Time when the name carries no parsable date.
//   - Type is derived from the directory the file lives in (see
//     contentTypeForDir); it is empty when no known folder name matches.
//   - Tags is whatever extractTags finds in the text.
//
// SourceURL is not populated here.
//
// The walk stops at the first traversal or read error. That error is returned
// together with the content gathered up to that point.
func collectTextData(directory string) ([]Content, error) {
	var allContent []Content

	err := filepath.Walk(directory, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		// Directories and non-".txt" entries are not content; skip them.
		if info.IsDir() || !strings.HasSuffix(info.Name(), ".txt") {
			return nil
		}

		raw, err := ioutil.ReadFile(path)
		if err != nil {
			return err
		}
		// Convert once; the same string feeds both Text and tag extraction.
		text := string(raw)

		allContent = append(allContent, Content{
			// Classify by the containing directory only, so that a folder
			// keyword appearing in the file's own name cannot misclassify it.
			Type: contentTypeForDir(filepath.Dir(path)),
			Text: text,
			Date: dateFromFileName(info.Name()),
			Tags: extractTags(text),
		})
		return nil
	})

	return allContent, err
}

// dateFromFileName parses the "YYYY-MM-DD" prefix of a ".txt" file name: the
// part before the first "_", or the whole stem when there is no "_". It
// returns the zero time.Time when that prefix is not a valid date.
func dateFromFileName(fileName string) time.Time {
	// Drop the extension first so a name like "2024-01-02.txt", which has no
	// "_" separator, still yields a clean date string.
	stem := strings.TrimSuffix(fileName, ".txt")
	datePart := strings.SplitN(stem, "_", 2)[0]

	date, err := time.Parse("2006-01-02", datePart)
	if err != nil {
		// Best effort: an undated file is still collected, just without a date.
		return time.Time{}
	}
	return date
}

// contentTypeForDir maps a directory path to a ContentType by looking for the
// known folder names in it. The first match in the order journals, blog_posts,
// social_media, references wins; no match yields the empty ContentType.
func contentTypeForDir(dir string) ContentType {
	switch {
	case strings.Contains(dir, "journals"):
		return Journal
	case strings.Contains(dir, "blog_posts"):
		return BlogPost
	case strings.Contains(dir, "social_media"):
		return SocialPost
	case strings.Contains(dir, "references"):
		return Reference
	}
	return ""
}
// extractTags returns the hashtags found in text, in order of appearance and
// without their leading "#".
//
// text is split on whitespace, and a word counts as a tag when it starts with
// "#". Leading "#" characters and trailing sentence punctuation (".,;:!?") are
// stripped, so "#go," yields "go". Words with nothing left after stripping,
// such as a bare "#" or a Markdown heading marker like "##", are skipped.
// Duplicates are kept. The result is nil when no tag is found.
func extractTags(text string) []string {
	var tags []string
	for _, word := range strings.Fields(text) {
		if !strings.HasPrefix(word, "#") {
			continue
		}
		tag := strings.TrimRight(strings.TrimLeft(word, "#"), ".,;:!?")
		if tag == "" {
			// Only marker/punctuation characters: not a real tag.
			continue
		}
		tags = append(tags, tag)
	}
	return tags
}