Skip to content

Commit

Permalink
new model
Browse files Browse the repository at this point in the history
  • Loading branch information
sokomishalov committed Feb 10, 2020
1 parent a3b3842 commit f5b1bc5
Show file tree
Hide file tree
Showing 40 changed files with 1,154 additions and 365 deletions.
26 changes: 13 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,18 +65,18 @@ Each scraper is a class which implements [Skraper](skraper-core/src/main/kotlin/
interface Skraper {
val baseUrl: String
val client: SkraperClient get() = DefaultBlockingSkraperClient
suspend fun getLatestPosts(uri: String, limit: Int = 100): List<Post>
suspend fun getPageLogoUrl(uri: String, imageSize: ImageSize = ImageSize.SMALL): String?
suspend fun getLogoUrl(imageSize: ImageSize = ImageSize.SMALL): String? = "${baseUrl}/favicon.ico"
suspend fun getPosts(path: String, limit: Int = DEFAULT_POSTS_LIMIT): List<Post>
suspend fun getLogoUrl(path: String, imageSize: ImageSize = ImageSize.SMALL): String?
suspend fun getProviderLogoUrl(imageSize: ImageSize = ImageSize.SMALL): String? = "${baseUrl}/favicon.ico"
}
```

### The latest user/channel/trend posts
### Scrape user/community/channel/topic/trend posts
To scrape the latest posts for a specific user, channel or trend, use a skraper like this:
```kotlin
fun main() = runBlocking {
val skraper = FacebookSkraper()
val posts = skraper.getLatestPosts(uri = "/memes", limit = 2)
val posts = skraper.getPosts(path = "/memes", limit = 2)
println(JsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(posts))
}
```
Expand All @@ -85,8 +85,8 @@ Received data structure is similar to each other provider's. Output data example
[
{
"id" : "5029851093699104",
"caption" : "gotta love em!",
"publishTimestamp" : 1580744400000,
"text" : "gotta love em!",
"publishedAt" : 1580744400000,
"rating" : 79,
"commentsCount" : 3,
"attachments" : [ {
Expand All @@ -96,8 +96,8 @@ Received data structure is similar to each other provider's. Output data example
} ]
}, {
"id" : "4990218157662398",
"caption" : "Interesting",
"publishTimestamp" : 1580742000000,
"text" : "Interesting",
"publishedAt" : 1580742000000,
"rating" : 3092,
"commentsCount" : 514,
"attachments" : [ {
Expand All @@ -111,12 +111,12 @@ Received data structure is similar to each other provider's. Output data example

You can see the full model structure for posts and others [here](skraper-core/src/main/kotlin/ru/sokomishalov/skraper/model)

### Get user/channel/trend logo
### Scrape user/community/channel/topic/trend logo
It is possible to scrape the logo of a user, channel or trend:
```kotlin
fun main() = runBlocking {
val skraper = TwitterSkraper()
val pageLogo = skraper.getPageLogoUrl(uri = "/memes")
val pageLogo = skraper.getLogoUrl(path = "/memes")
println(pageLogo)
}
```
Expand All @@ -126,13 +126,13 @@ Output:
https://pbs.twimg.com/profile_images/824808708332941313/mJ4xM6PH_400x400.jpg
```

### Get provider logo
### Scrape provider logo
It is also possible to scrape the provider's logo:

```kotlin
fun main() = runBlocking {
val skraper = InstagramSkraper()
val logo = skraper.getLogoUrl()
val logo = skraper.getProviderLogoUrl()
println(logo)
}
```
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
<jackson.version>2.10.2</jackson.version>
<commons-text.version>1.8</commons-text.version>
<logback.version>1.2.3</logback.version>
<jsoup.version>1.12.1</jsoup.version>
<jsoup.version>1.12.2</jsoup.version>
<reactor-netty.version>0.9.4.RELEASE</reactor-netty.version>
<okhttp3.version>4.3.1</okhttp3.version>
<spring.version>5.2.3.RELEASE</spring.version>
Expand Down
14 changes: 7 additions & 7 deletions skraper-core/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
package ru.sokomishalov.skraper

import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.consts.DEFAULT_LOGO_SIZE
import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_LIMIT
import ru.sokomishalov.skraper.model.ImageSize
import ru.sokomishalov.skraper.model.ImageSize.SMALL
import ru.sokomishalov.skraper.model.Post

/**
Expand All @@ -37,22 +37,22 @@ interface Skraper {
val client: SkraperClient get() = DefaultBlockingSkraperClient

/**
* @param uri specific uri for the page
* @param limit limit for an amount of posts to return
* @param path page specific url path
* @param limit for an amount of posts to return
* @return list of posts
*/
suspend fun getLatestPosts(uri: String, limit: Int = DEFAULT_POSTS_LIMIT): List<Post>
suspend fun getPosts(path: String, limit: Int = DEFAULT_POSTS_LIMIT): List<Post>

/**
* @param uri specific uri for the page
* @param path page specific url path
* @param imageSize choice for specific logo size if it's possible
* @return page logo url
*/
suspend fun getPageLogoUrl(uri: String, imageSize: ImageSize = SMALL): String?
suspend fun getLogoUrl(path: String, imageSize: ImageSize = DEFAULT_LOGO_SIZE): String?

/**
* @param imageSize choice for specific logo size if it's possible
* @return provider logo url
*/
suspend fun getProviderLogoUrl(imageSize: ImageSize = SMALL): String? = "${baseUrl}/favicon.ico"
suspend fun getProviderLogoUrl(imageSize: ImageSize = DEFAULT_LOGO_SIZE): String? = "${baseUrl}/favicon.ico"
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ package ru.sokomishalov.skraper.client.spring

import io.netty.handler.ssl.SslContextBuilder
import io.netty.handler.ssl.util.InsecureTrustManagerFactory
import kotlinx.coroutines.reactive.awaitFirstOrNull
import org.springframework.http.client.reactive.ReactorClientHttpConnector
import org.springframework.web.reactive.function.client.ExchangeStrategies
import org.springframework.web.reactive.function.client.WebClient
import org.springframework.web.reactive.function.client.bodyToMono
import org.springframework.web.reactive.function.client.awaitBodyOrNull
import org.springframework.web.reactive.function.client.awaitExchange
import reactor.netty.http.client.HttpClient
import ru.sokomishalov.skraper.SkraperClient

Expand All @@ -39,10 +39,8 @@ class SpringReactiveSkraperClient(
.get()
.uri(url)
.headers { headers.forEach { (k, v) -> it[k] = v } }
.exchange()
.awaitFirstOrNull()
?.bodyToMono<ByteArray>()
?.awaitFirstOrNull()
.awaitExchange()
.awaitBodyOrNull()
}

companion object {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@
*/
package ru.sokomishalov.skraper.internal.consts

import ru.sokomishalov.skraper.model.ImageSize
import ru.sokomishalov.skraper.model.ImageSize.SMALL

/**
* @author sokomishalov
*/

const val DEFAULT_POSTS_LIMIT: Int = 100
const val DEFAULT_POSTS_ASPECT_RATIO: Double = 1.0
const val DEFAULT_POSTS_LIMIT: Int = 50
const val DEFAULT_POSTS_ASPECT_RATIO: Double = 1.0
val DEFAULT_LOGO_SIZE: ImageSize = SMALL

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import ru.sokomishalov.skraper.internal.jsoup.getSingleElementByAttributeOrNull
import ru.sokomishalov.skraper.internal.jsoup.getSingleElementByClassOrNull
import ru.sokomishalov.skraper.internal.jsoup.getSingleElementByTagOrNull
import ru.sokomishalov.skraper.internal.serialization.aReadJsonNodes
import ru.sokomishalov.skraper.internal.url.uriCleanUp
import ru.sokomishalov.skraper.model.Attachment
import ru.sokomishalov.skraper.model.AttachmentType.IMAGE
import ru.sokomishalov.skraper.model.AttachmentType.VIDEO
Expand All @@ -39,47 +38,45 @@ import kotlin.text.Charsets.UTF_8
/**
* @author sokomishalov
*/
class FacebookSkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient
class FacebookSkraper(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: String = "https://facebook.com"
) : Skraper {

override val baseUrl: String = "https://facebook.com"

override suspend fun getPageLogoUrl(uri: String, imageSize: ImageSize): String? {
val document = getPage(uri)

return document
?.getElementsByAttributeValue("property", "og:image")
?.firstOrNull()
?.attr("content")
}

override suspend fun getLatestPosts(uri: String, limit: Int): List<Post> {
val document = getPage(uri)
override suspend fun getPosts(path: String, limit: Int): List<Post> {
val document = getPage(path = path)

val elements = document.extractPosts(limit)
val jsonData = document.extractJsonData()
val metaInfoJsonMap = jsonData.prepareMetaInfoMap()

return elements.map {
val id = it.getIdByUserContentWrapper()
val id = it.extractId()
val node = metaInfoJsonMap[id]

Post(
id = id,
text = it.getCaptionByUserContentWrapper(),
publishedAt = it.getPublishedAtByUserContentWrapper(),
text = it.extractText(),
publishedAt = it.extractPublishDateTime(),
rating = node.extractReactionCount(),
commentsCount = node.extractCommentsCount(),
attachments = it.getAttachmentsByUserContentWrapper()
attachments = it.extractAttachments()
)
}
}

private suspend fun getPage(uri: String): Document? {
return client.fetchDocument("${baseUrl}/${uri.uriCleanUp()}/posts")
/**
 * Resolves the logo url of the page located at [path].
 *
 * The page is loaded through `getPage`, and the result is the `content` attribute of the first
 * element whose `property` attribute equals `og:image`. The [imageSize] argument is not read here.
 *
 * @param path page specific url path
 * @param imageSize requested logo size (unused by this implementation)
 * @return the attribute value, or null when the page or the element is unavailable
 */
override suspend fun getLogoUrl(path: String, imageSize: ImageSize): String? {
    val ogImageElement = getPage(path = path)
        ?.getElementsByAttributeValue("property", "og:image")
        ?.firstOrNull()

    return ogImageElement?.attr("content")
}


/**
 * Fetches the document addressed by [baseUrl] immediately followed by [path].
 *
 * @param path url path appended to the base url as-is
 * @return whatever `client.fetchDocument` yields for that url
 */
private suspend fun getPage(path: String): Document? {
    return client.fetchDocument("$baseUrl$path")
}

private fun JsonNode?.prepareMetaInfoMap(): Map<String, JsonNode> {
return this
?.get("pre_display_requires")
Expand Down Expand Up @@ -115,21 +112,21 @@ class FacebookSkraper @JvmOverloads constructor(
.orEmpty()
}

private fun Element.getIdByUserContentWrapper(): String {
/**
 * Reads the post id from the `value` attribute of the first element whose `name` attribute
 * equals `ft_ent_identifier`.
 *
 * @return the attribute value, or an empty string when no such element is found
 */
private fun Element.extractId(): String {
    val identifierElement = getElementsByAttributeValue("name", "ft_ent_identifier")?.firstOrNull()
    return identifierElement?.attr("value").orEmpty()
}

private fun Element.getCaptionByUserContentWrapper(): String? {
/**
 * Reads the post text: the whole text of the `p` element looked up inside the `userContent` element.
 *
 * @return the text, or null when either lookup returns null
 */
private fun Element.extractText(): String? {
    val paragraph = getSingleElementByClassOrNull("userContent")
        ?.getSingleElementByTagOrNull("p")

    return paragraph?.wholeText()?.toString()
}

private fun Element.getPublishedAtByUserContentWrapper(): Long? {
private fun Element.extractPublishDateTime(): Long? {
return getSingleElementByAttributeOrNull("data-utime")
?.attr("data-utime")
?.toLongOrNull()
Expand All @@ -150,7 +147,7 @@ class FacebookSkraper @JvmOverloads constructor(
?.asInt()
}

private fun Element.getAttachmentsByUserContentWrapper(): List<Attachment> {
private fun Element.extractAttachments(): List<Attachment> {
val videoElement = getSingleElementByTagOrNull("video")

return when {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
* Copyright 2019-2020 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ru.sokomishalov.skraper.provider.facebook

import ru.sokomishalov.skraper.internal.consts.DEFAULT_LOGO_SIZE
import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_LIMIT
import ru.sokomishalov.skraper.model.ImageSize
import ru.sokomishalov.skraper.model.Post


/**
* @author sokomishalov
*/

/**
 * Convenience wrapper over [FacebookSkraper.getPosts] that takes a username instead of a path.
 *
 * The path passed to `getPosts` is produced by `buildUserPath` from [username].
 *
 * @param username Facebook user/page name
 * @param limit limit for the amount of posts to return
 * @return list of posts returned by `getPosts`
 */
suspend fun FacebookSkraper.getUserPosts(username: String, limit: Int = DEFAULT_POSTS_LIMIT): List<Post> =
    getPosts(path = username.buildUserPath(), limit = limit)

/**
 * Convenience wrapper over [FacebookSkraper.getLogoUrl] that takes a username instead of a path.
 *
 * The path passed to `getLogoUrl` is produced by `buildUserPath` from [username].
 *
 * @param username Facebook user/page name
 * @param imageSize choice for specific logo size, forwarded unchanged
 * @return logo url returned by `getLogoUrl`
 */
suspend fun FacebookSkraper.getUserLogoUrl(username: String, imageSize: ImageSize = DEFAULT_LOGO_SIZE): String? =
    getLogoUrl(path = username.buildUserPath(), imageSize = imageSize)


/**
 * Builds the posts page path for the receiver username.
 *
 * Leading and trailing slashes are stripped first, so `memes`, `/memes` and `memes/` all
 * produce `/memes/posts` rather than a path containing a doubled slash.
 *
 * @return path of the form `/<username>/posts`
 */
private fun String.buildUserPath(): String = "/${trim('/')}/posts"
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.fetchDocument
import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_ASPECT_RATIO
import ru.sokomishalov.skraper.internal.jsoup.getSingleElementByTag
import ru.sokomishalov.skraper.internal.url.uriCleanUp
import ru.sokomishalov.skraper.model.Attachment
import ru.sokomishalov.skraper.model.AttachmentType.IMAGE
import ru.sokomishalov.skraper.model.AttachmentType.VIDEO
Expand All @@ -32,14 +31,13 @@ import ru.sokomishalov.skraper.model.Post
/**
* @author sokomishalov
*/
class IFunnySkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient
class IFunnySkraper(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: String = "https://ifunny.co"
) : Skraper {

override val baseUrl: String = "https://ifunny.co"

override suspend fun getLatestPosts(uri: String, limit: Int): List<Post> {
val document = getTopicPage(uri)
override suspend fun getPosts(path: String, limit: Int): List<Post> {
val document = getPage(path = path)

val posts = document
?.getElementsByClass("stream__item")
Expand Down Expand Up @@ -74,11 +72,14 @@ class IFunnySkraper @JvmOverloads constructor(
}
}

override suspend fun getPageLogoUrl(uri: String, imageSize: ImageSize): String? {
return getProviderLogoUrl(imageSize)
}
override suspend fun getLogoUrl(path: String, imageSize: ImageSize): String? {
val document = getPage(path = path)

private suspend fun getTopicPage(uri: String): Document? {
return client.fetchDocument("${baseUrl}/${uri.uriCleanUp()}")
return document
?.getElementsByAttributeValue("property", "og:image")
?.firstOrNull()
?.attr("content")
}

/**
 * Fetches the document addressed by [baseUrl] immediately followed by [path].
 *
 * @param path url path appended to the base url as-is
 * @return whatever `client.fetchDocument` yields for that url
 */
private suspend fun getPage(path: String): Document? {
    return client.fetchDocument("${baseUrl}${path}")
}
}
Loading

0 comments on commit f5b1bc5

Please sign in to comment.