This repository has been archived by the owner on Aug 6, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_media_to_topics.js
113 lines (107 loc) · 4.16 KB
/
extract_media_to_topics.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
var fs = require('fs');
const { sanitazeMessage , sanitazeAttachment } = require('./sanitaze')
var crc32 = require('js-crc').crc32;
var username_uid = require('./username_uid.json');
const axios = require('axios')
var excerpt = require('excerpt-html');
const api_keys_by_uid = require('./api_keys_by_uid.json')
/**
 * Sequential, promise-aware counterpart of `Array.prototype.forEach`.
 *
 * Each invocation of `callback` is awaited before the next element is
 * visited, so asynchronous callbacks never overlap.
 *
 * @param {Array} array - Items to walk, in index order.
 * @param {Function} callback - Called as `callback(item, index, array)`; may return a promise.
 * @returns {Promise<void>} Settles once every callback has finished; rejects on the first failure.
 */
async function asyncForEach(array, callback) {
  let position = 0;
  // The length is re-read on every pass, exactly like a classic index loop.
  while (position < array.length) {
    await callback(array[position], position, array);
    position += 1;
  }
}
/**
 * Sort comparator that orders records by ascending `utime`.
 *
 * @param {{utime: *}} a - Left-hand record.
 * @param {{utime: *}} b - Right-hand record.
 * @returns {number} -1 when `a` sorts first, 1 when `b` sorts first, otherwise 0.
 */
function compare(a, b) {
  if (a.utime < b.utime) return -1;
  return a.utime > b.utime ? 1 : 0;
}
// Directory holding the scraped Facebook posts, one JSON file per post.
const SCRAPED_DIR = 'F:\\scraper\\scrapped_v2\\';
// Directory that receives the forum's response for every topic it created.
const FORUM_POSTS_DIR = './forum_posts';
// A batch of in-flight requests is awaited whenever the post index is a
// multiple of this value (and once more after the very last post).
const BATCH_SIZE = 50;

// Not read anywhere in this file; kept so the name stays defined.
const fbid_forumid = {};

/**
 * Builds the axios request config that creates one forum topic from a scraped post.
 *
 * @param {object} post - Parsed scraped post; `userContentHTML`, `attachsHTML`,
 *   `author.permalink`, `id` and `utime` are read.
 * @returns {object} Config object to hand to `axios(...)`.
 */
function buildTopicRequest(post) {
  const message = sanitazeMessage(post.userContentHTML);
  const attachment = sanitazeAttachment(post.attachsHTML);
  const userContentHTML = message.messageHTML;
  const attachsHTML = attachment.attachmentHTML;
  const imgs = message.imgs.concat(attachment.imgs);

  // The CRC32 of the cleaned author permalink is used both as the forum
  // username and as the key into the username -> uid -> API key tables.
  const hashed = crc32(cleanUserPermalink(post.author.permalink));

  // Title is a plain-text excerpt of the message, or of the attachments
  // when the message itself is empty.
  const title = excerpt(userContentHTML !== '' ? userContentHTML : attachsHTML, {
    stripTags: true, // Set to false to get html code
    pruneLength: 50, // Amount of characters that the excerpt should contain
  });

  const attachmentsSection =
    attachsHTML !== '' || imgs.length
      ? '\r\n **Adjuntos:** \r\n' + attachsHTML + imgs.join('\r\n')
      : '';

  return {
    method: 'post',
    url: 'https://1btcarg.com/posts.json',
    data: {
      title: title.length ? title : 'Sin título',
      raw: userContentHTML + attachmentsSection + ('\r\n *fb_postid:* \r\n' + post.id),
      // Echoed back through `response.config.data`, which is how the
      // response is matched to its Facebook post when it is saved.
      fbid: post.id,
      created_at: new Date(post.utime * 1000).toJSON(),
    },
    headers: {
      'Api-Key': api_keys_by_uid[username_uid[hashed]],
      'Api-Username': hashed,
    },
  };
}

/**
 * Persists every fulfilled response of a settled batch and reports the failures.
 *
 * All fulfilled responses are written before the caller decides to abort, so a
 * single failed request no longer discards the record of topics that were
 * already created in the same batch.
 *
 * @param {Array<object>} results - Output of `Promise.allSettled` for one batch.
 * @returns {number} How many requests in the batch were rejected.
 */
function recordBatchResults(results) {
  let failures = 0;
  results.forEach((result, i) => {
    if (result.status === 'rejected') {
      failures += 1;
      const reason = result.reason;
      // Not every rejection reason is guaranteed to expose toJSON().
      const details = reason && typeof reason.toJSON === 'function' ? reason.toJSON() : reason;
      console.error('error', i, details);
      return;
    }
    const fbid = JSON.parse(result.value.config.data).fbid;
    console.log(result.value.data);
    fs.writeFileSync(
      `${FORUM_POSTS_DIR}/${fbid}.json`,
      JSON.stringify({ fbid, data: result.value.data }),
    );
  });
  return failures;
}

// Entry point: load every scraped post, sort them oldest-first and replay
// them against the forum API in batches.
fs.readdir(SCRAPED_DIR, function (err, filenames) {
  if (err) {
    console.error('error', `cannot read ${SCRAPED_DIR}`, err);
    process.exitCode = 1;
    return;
  }

  // Sort by creation time so topics are submitted in chronological order.
  const posts = filenames
    .filter((filename) => filename.endsWith('.json'))
    .map((filename) => {
      const post = require(SCRAPED_DIR + filename);
      return { id: post.id, utime: parseInt(post.utime, 10), post };
    })
    .sort(compare)
    .map((entry) => entry.post);

  // Make sure responses can be saved before any topic is created remotely.
  fs.mkdirSync(FORUM_POSTS_DIR, { recursive: true });

  const lastIndex = posts.length - 1;
  let pending = [];

  asyncForEach(posts, async (post, n) => {
    console.log(post.id + '.json', n, lastIndex);
    pending.push(axios(buildTopicRequest(post)));

    if (n % BATCH_SIZE === 0 || n === lastIndex) {
      const results = await Promise.allSettled(pending);
      pending = [];
      if (recordBatchResults(results) > 0) {
        // Stop at the first failing batch and signal the failure to the caller.
        process.exit(1);
      }
    }
  }).catch((error) => {
    // Anything thrown while preparing a post would otherwise be an unhandled rejection.
    console.error('error', error);
    process.exit(1);
  });
});
/**
 * Renders a UNIX timestamp as `D Mon YYYY H:M:S` in the local time zone.
 *
 * Numeric fields are not zero-padded.
 *
 * @param {number} UNIX_timestamp - Seconds since the UNIX epoch.
 * @returns {string} The formatted local date and time.
 */
function timeConverter(UNIX_timestamp) {
  const monthAbbreviations = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
  ];
  // Date works in milliseconds, the input is in seconds.
  const moment = new Date(UNIX_timestamp * 1000);
  const datePart = `${moment.getDate()} ${monthAbbreviations[moment.getMonth()]} ${moment.getFullYear()}`;
  const clockPart = `${moment.getHours()}:${moment.getMinutes()}:${moment.getSeconds()}`;
  return `${datePart} ${clockPart}`;
}
/**
 * Reduces a Facebook profile permalink to its bare user identifier.
 *
 * The patterns are applied in order, each removing only its first match:
 * the site prefix, the `profile.php?id=` marker, everything from the first
 * `&`, everything from the first `?`, a trailing slash, and finally a
 * local `file:///F:/` prefix.
 *
 * @param {string} permalink - Profile URL as found in the scraped post.
 * @returns {string} The permalink with the surrounding URL noise removed.
 */
function cleanUserPermalink(permalink) {
  const noisePatterns = [
    /https:\/\/www.facebook.com\//,
    /profile.php\?id=/,
    /\&[^]*$/,
    /\?[^]*$/,
    /\/$/,
    /file:\/\/\/F:\//,
  ];
  return noisePatterns.reduce((cleaned, pattern) => cleaned.replace(pattern, ''), permalink);
}