Skip to content

Commit

Permalink
get posts to flow
Browse files Browse the repository at this point in the history
  • Loading branch information
sokomishalov committed Apr 21, 2021
1 parent 2fb6375 commit d3f0735
Show file tree
Hide file tree
Showing 45 changed files with 434 additions and 476 deletions.
41 changes: 11 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,11 @@ interface:
```kotlin
interface Skraper {
val baseUrl: URLString
val client: SkraperClient get() = DefaultBlockingSkraperClient
suspend fun getProviderInfo(): ProviderInfo?
val name: String
val client: SkraperClient
fun supports(url: String): Boolean
suspend fun getPosts(path: String): Flow<Post>
suspend fun getPageInfo(path: String): PageInfo?
suspend fun getPosts(path: String, limit: Int = DEFAULT_POSTS_LIMIT): List<Post>
suspend fun resolve(media: Media): Media
}
```
Expand All @@ -224,8 +225,9 @@ To scrape the latest posts for specific user, channel or trend use skraper like
```kotlin
suspend fun main() {
val skraper = FacebookSkraper()
val posts = skraper.getUserPosts(username = "memes", limit = 2) // extension for getPosts()
println(JsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(posts))
val posts = skraper.getUserPosts(username = "memes").take(2).toList() // extension for getPosts()
val serializer = JsonMapper().writerWithDefaultPrettyPrinter()
println(serializer.writeValueAsString(posts))
}
```

Expand Down Expand Up @@ -273,7 +275,8 @@ It is possible to scrape user/channel/trend info for some purposes:
suspend fun main() {
val skraper = TwitterSkraper()
val pageInfo = skraper.getUserInfo(username = "memes") // extension for `getPageInfo()`
println(JsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(pageInfo))
val serializer = JsonMapper().writerWithDefaultPrettyPrinter()
println(serializer.writeValueAsString(pageInfo))
}
```

Expand Down Expand Up @@ -303,7 +306,8 @@ Sometimes you need to know direct media link:
suspend fun main() {
val skraper = InstagramSkraper()
val info = skraper.resolve(Video(url = "https://www.instagram.com/p/B-flad2F5o7/"))
println(JsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(info))
val serializer = JsonMapper().writerWithDefaultPrettyPrinter()
println(serializer.writeValueAsString(info))
}
```

Expand Down Expand Up @@ -352,29 +356,6 @@ Output:
/var/folders/sf/hm2h5chx5fl4f70bj77xccsc0000gp/T/skraper8377953374796527777/Do_no_harm.jpg
```

### Scrape provider logo

It is also possible to scrape provider info for some purposes:

```kotlin
suspend fun main() {
val skraper = InstagramSkraper()
val info = skraper.getProviderInfo()
println(JsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(info))
}
```

Output:

```json5
{
"name": "Instagram",
"logo": {
"url": "https://instagram.com/favicon.ico"
}
}
```

# Telegram bot

To use the bot follow the [link](https://t.me/SkraperBot).
11 changes: 7 additions & 4 deletions cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Main.kt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import com.xenomachina.argparser.mainBody
import kotlinx.coroutines.asCoroutineDispatcher
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.flow.take
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.runBlocking
import ru.sokomishalov.skraper.Skrapers
import ru.sokomishalov.skraper.model.Post
Expand All @@ -39,10 +41,11 @@ fun main(args: Array<String>) = mainBody(columns = 100) {
with(t) { println("${green("Skraper")} ${magenta("v.0.7.0")} started") }

val posts = runBlocking {
parsedArgs.skraper.getPosts(
path = "/${parsedArgs.path.removePrefix("/")}",
limit = parsedArgs.amount
)
parsedArgs
.skraper
.getPosts("/${parsedArgs.path.removePrefix("/")}")
.take(parsedArgs.amount)
.toList()
}

when {
Expand Down
26 changes: 11 additions & 15 deletions skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
*/
package ru.sokomishalov.skraper

import kotlinx.coroutines.flow.Flow
import ru.sokomishalov.skraper.client.SkraperClient
import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_LIMIT
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.model.*
import ru.sokomishalov.skraper.model.Media
import ru.sokomishalov.skraper.model.PageInfo
import ru.sokomishalov.skraper.model.Post

/**
* Interface for the minimum provider functionality
Expand All @@ -34,23 +36,18 @@ interface Skraper {
/**
* @return provider base url
*/
val baseUrl: URLString
val baseUrl: String

/**
* @return http client
*/
val client: SkraperClient get() = DefaultBlockingSkraperClient

/**
* @param url potential provider relative url
* @return true if such skraper supports this url
*/
suspend fun supports(url: URLString): Boolean = url.host.removePrefix("www.") in baseUrl.host

/**
* @return provider info
* @param path page specific url path (should start with "/")
* @return flow of posts
*/
suspend fun getProviderInfo(): ProviderInfo? = ProviderInfo(name, baseUrl.buildFullURL(path = "/favicon.ico").toImage())
fun getPosts(path: String): Flow<Post>

/**
* @param path page specific url path (should start with "/")
Expand All @@ -59,11 +56,10 @@ interface Skraper {
suspend fun getPageInfo(path: String): PageInfo?

/**
* @param path page specific url path (should start with "/")
* @param limit for an amount of posts to return
* @return list of posts
* @param url potential provider relative url
* @return true if such skraper supports this url
*/
suspend fun getPosts(path: String, limit: Int = DEFAULT_POSTS_LIMIT): List<Post>
fun supports(url: String): Boolean = url.host.removePrefix("www.") in baseUrl.host

/**
* @param media with provider relative url
Expand Down
7 changes: 5 additions & 2 deletions skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.ffmpeg.FfmpegCliRunner
import ru.sokomishalov.skraper.internal.ffmpeg.FfmpegRunner
import ru.sokomishalov.skraper.internal.net.path
import ru.sokomishalov.skraper.model.*
import ru.sokomishalov.skraper.model.Audio
import ru.sokomishalov.skraper.model.Image
import ru.sokomishalov.skraper.model.Media
import ru.sokomishalov.skraper.model.Video
import ru.sokomishalov.skraper.provider.facebook.FacebookSkraper
import ru.sokomishalov.skraper.provider.flickr.FlickrSkraper
import ru.sokomishalov.skraper.provider.ifunny.IFunnySkraper
Expand Down Expand Up @@ -55,7 +58,7 @@ object Skrapers {
* @param url potential provider relative url
* @return skraper which supports this url or null if none of skrapers supports it
*/
suspend fun suitable(url: URLString): Skraper? {
fun suitable(url: String): Skraper? {
return providers.find { it.supports(url) }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,12 @@ package ru.sokomishalov.skraper.client

import ru.sokomishalov.skraper.client.HttpMethodType.GET
import ru.sokomishalov.skraper.internal.consts.DEFAULT_HEADERS
import ru.sokomishalov.skraper.model.URLString

/**
* @author sokomishalov
*/
data class HttpRequest @JvmOverloads constructor(
val url: URLString,
val url: String,
val method: HttpMethodType = GET,
val headers: Map<String, String> = DEFAULT_HEADERS,
val body: ByteArray? = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ package ru.sokomishalov.skraper.internal.consts

internal const val DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
internal const val USER_AGENT_HEADER = "User-Agent"
internal const val DEFAULT_POSTS_LIMIT = 50

@JvmField
internal val DEFAULT_HEADERS = mapOf(USER_AGENT_HEADER to DEFAULT_USER_AGENT)

@JvmField
internal val CRAWLER_USER_AGENTS = setOf("Googlebot", "Slurp", "Yandex", "msnbot", "bingbot")
internal val CRAWLER_USER_AGENTS = setOf("Googlebot", "Slurp", "Yandex", "msnbot", "bingbot")

internal const val DEFAULT_POSTS_BATCH = 50
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
@file:Suppress("NOTHING_TO_INLINE")

package ru.sokomishalov.skraper.internal.iterable

import kotlinx.coroutines.flow.FlowCollector


/**
 * Emits every element of this iterable into [collector], converting each one
 * with the receiver-style [transform] lambda first.
 */
internal suspend inline fun <T, R> Iterable<T>.emitThis(collector: FlowCollector<R>, transform: T.() -> R) {
    for (element in this) {
        collector.emit(element.transform())
    }
}

/**
 * Maps every element of this iterable using the receiver-style [transform] lambda
 * and returns the results as a new list.
 */
internal inline fun <T, R> Iterable<T>.mapThis(transform: T.() -> R): List<R> = this.map { it.transform() }
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import kotlinx.coroutines.Dispatchers.IO
import kotlinx.coroutines.withContext
import ru.sokomishalov.skraper.client.HttpMethodType
import ru.sokomishalov.skraper.client.HttpMethodType.GET
import ru.sokomishalov.skraper.model.URLString
import java.io.DataOutputStream
import java.net.HttpURLConnection
import java.net.HttpURLConnection.*
Expand Down Expand Up @@ -59,10 +58,10 @@ internal suspend fun URL.openRedirectableConnection(
}
}

val URLString.path: String
val String.path: String
get() = URL(this).path

val URLString.host: String
val String.host: String
get() = URL(this).host

private fun HttpURLConnection.applyData(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import java.time.Duration
* @property url media url
*/
sealed class Media {
abstract val url: URLString
abstract val url: String
}

/**
Expand All @@ -31,7 +31,7 @@ sealed class Media {
* @property aspectRatio width to height ratio
*/
data class Image(
override val url: URLString,
override val url: String,
val aspectRatio: Double? = null
) : Media()

Expand All @@ -43,7 +43,7 @@ data class Image(
* @property duration video duration
*/
data class Video(
override val url: URLString,
override val url: String,
val aspectRatio: Double? = null,
val thumbnail: Image? = null,
val duration: Duration? = null
Expand All @@ -55,6 +55,6 @@ data class Video(
* @property duration audio duration
*/
data class Audio(
override val url: URLString,
override val url: String,
val duration: Duration? = null
) : Media()

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ package ru.sokomishalov.skraper.model

import ru.sokomishalov.skraper.internal.string.escapeUrl

internal inline fun URLString.toImage(): Image = Image(url = this)
/** Wraps this URL string into an [Image] model, leaving all optional fields at their defaults. */
internal inline fun String.toImage(): Image {
    return Image(url = this)
}

internal inline fun URLString.toVideo(): Video = Video(url = this)
/** Wraps this URL string into a [Video] model, leaving all optional fields at their defaults. */
internal inline fun String.toVideo(): Video {
    return Video(url = this)
}

internal fun URLString.buildFullURL(path: String, queryParams: Map<String, Any?> = emptyMap()): URLString {
internal fun String.buildFullURL(path: String, queryParams: Map<String, Any?> = emptyMap()): String {
val baseUrlString = removeSuffix("/")
val pathString = "/" + path.removePrefix("/").removeSuffix("/")
val queryParamsString = queryParams
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
package ru.sokomishalov.skraper.provider.facebook

import com.fasterxml.jackson.databind.JsonNode
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import ru.sokomishalov.skraper.Skraper
Expand All @@ -24,6 +26,7 @@ import ru.sokomishalov.skraper.client.SkraperClient
import ru.sokomishalov.skraper.client.fetchDocument
import ru.sokomishalov.skraper.client.fetchOpenGraphMedia
import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.iterable.emitThis
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByAttribute
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByAttributeValue
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass
Expand All @@ -42,29 +45,29 @@ import java.time.Instant
*/
open class FacebookSkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: URLString = "https://facebook.com"
override val baseUrl: String = "https://facebook.com"
) : Skraper {

override suspend fun getPosts(path: String, limit: Int): List<Post> {
override fun getPosts(path: String): Flow<Post> = flow {
val postsPath = path.substringBefore("/posts") + "/posts"
val page = getPage(path = postsPath)

val posts = page.extractPosts(limit)
val posts = page.extractPosts()
val jsonData = page.extractJsonData()
val metaInfoJsonMap = jsonData.prepareMetaInfoMap()

return posts.map {
val id = it.extractPostId()
posts.emitThis(this) {
val id = extractPostId()
val metaInfoJson = metaInfoJsonMap[id]

Post(
id = id,
text = it.extractPostText(),
publishedAt = it.extractPostPublishDateTime(),
text = extractPostText(),
publishedAt = extractPostPublishDateTime(),
rating = metaInfoJson?.extractPostReactionCount(),
commentsCount = metaInfoJson?.extractPostCommentsCount(),
viewsCount = metaInfoJson?.extractPostViewsCount(),
media = it.extractPostMediaItems()
media = extractPostMediaItems()
)
}
}
Expand Down Expand Up @@ -106,8 +109,7 @@ open class FacebookSkraper @JvmOverloads constructor(
?.get("pre_display_requires")
?.map { it.findPath("__bbox") }
?.mapNotNull { it?.getByPath("result.data.feedback") }
?.map { it.getString("share_fbid").orEmpty() to it }
?.toMap()
?.associate { it.getString("share_fbid").orEmpty() to it }
.orEmpty()
}

Expand All @@ -126,10 +128,9 @@ open class FacebookSkraper @JvmOverloads constructor(
}
}

private fun Document?.extractPosts(limit: Int): List<Element> {
private fun Document?.extractPosts(): List<Element> {
return this
?.getElementsByClass("userContentWrapper")
?.take(limit)
.orEmpty()
}

Expand Down
Loading

0 comments on commit d3f0735

Please sign in to comment.