Skip to content

Commit

Permalink
Vimeo implemented (#314)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikhael-sokolov-rs committed Aug 26, 2022
1 parent 8edc4f0 commit fb307b9
Show file tree
Hide file tree
Showing 10 changed files with 314 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

<name>Skraper</name>
<description>Kotlin/Java library and cli tool for scraping posts and media from various sources with neither authorization nor full page rendering</description>
<url>https://github.com/sokomishalov/skraper/</url>
<url>https://github.com/sokomishalov/skraper</url>

<developers>
<developer>
Expand Down
6 changes: 4 additions & 2 deletions skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import ru.sokomishalov.skraper.provider.tiktok.TikTokSkraper
import ru.sokomishalov.skraper.provider.tumblr.TumblrSkraper
import ru.sokomishalov.skraper.provider.twitch.TwitchSkraper
import ru.sokomishalov.skraper.provider.twitter.TwitterSkraper
import ru.sokomishalov.skraper.provider.vimeo.VimeoSkraper
import ru.sokomishalov.skraper.provider.vk.VkSkraper
import ru.sokomishalov.skraper.provider.youtube.YoutubeSkraper
import java.io.File
Expand Down Expand Up @@ -160,7 +161,7 @@ object Skrapers {

// otherwise try to download as is
else -> {
providers.random().client.download(HttpRequest(url = resolved.url), destFile = destFile)
client.download(HttpRequest(url = resolved.url), destFile = destFile)
destFile
}
}
Expand Down Expand Up @@ -202,7 +203,8 @@ object Skrapers {
TumblrSkraper(),
IFunnySkraper(),
VkSkraper(),
PikabuSkraper()
PikabuSkraper(),
VimeoSkraper(),
)

return spiSkrapers + knownSkrapers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package ru.sokomishalov.skraper.client.ktor

import io.ktor.client.*
import io.ktor.client.plugins.*
import io.ktor.client.request.*
import io.ktor.client.statement.*
import io.ktor.http.ContentType
Expand All @@ -29,6 +30,8 @@ import kotlinx.coroutines.flow.flow
import ru.sokomishalov.skraper.client.HttpRequest
import ru.sokomishalov.skraper.client.HttpResponse
import ru.sokomishalov.skraper.client.SkraperClient
import ru.sokomishalov.skraper.internal.consts.DEFAULT_CONNECTION_TIMEOUT
import ru.sokomishalov.skraper.internal.consts.DEFAULT_READ_TIMEOUT
import ru.sokomishalov.skraper.internal.nio.aWrite
import java.io.File
import io.ktor.client.statement.HttpResponse as KtorHttpResponse
Expand Down Expand Up @@ -82,6 +85,10 @@ class KtorSkraperClient(
// Default Ktor client: follows redirects and installs the HttpTimeout plugin
// configured from the shared default timeout constants.
@JvmStatic
val DEFAULT_CLIENT: HttpClient = HttpClient {
followRedirects = true
install(HttpTimeout) {
// NOTE(review): the shared *read* timeout is applied here as Ktor's request timeout,
// which presumably bounds the whole call rather than a single read — confirm this is
// intended for long media downloads (HttpTimeout also exposes socketTimeoutMillis).
requestTimeoutMillis = DEFAULT_READ_TIMEOUT.toMillis()
connectTimeoutMillis = DEFAULT_CONNECTION_TIMEOUT.toMillis()
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ import okio.sink
import ru.sokomishalov.skraper.client.HttpRequest
import ru.sokomishalov.skraper.client.HttpResponse
import ru.sokomishalov.skraper.client.SkraperClient
import ru.sokomishalov.skraper.internal.consts.DEFAULT_CONNECTION_TIMEOUT
import ru.sokomishalov.skraper.internal.consts.DEFAULT_READ_TIMEOUT
import java.io.File
import java.io.IOException
import kotlin.coroutines.resume
Expand Down Expand Up @@ -119,6 +121,8 @@ class OkHttpSkraperClient(
.Builder()
.followRedirects(true)
.followSslRedirects(true)
.connectTimeout(DEFAULT_CONNECTION_TIMEOUT)
.readTimeout(DEFAULT_READ_TIMEOUT)
.build()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
package ru.sokomishalov.skraper.internal.consts

import java.time.Duration

internal const val USER_AGENT_HEADER = "User-Agent"
internal const val ACCEPT_LANGUAGE_HEADER = "Accept-Language"
internal const val DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
Expand All @@ -23,4 +25,7 @@ internal const val DEFAULT_ACCEPT_LANGUAGE_HEADER = "en-US"
@JvmField
internal val DEFAULT_HEADERS = mapOf(USER_AGENT_HEADER to DEFAULT_USER_AGENT, ACCEPT_LANGUAGE_HEADER to DEFAULT_ACCEPT_LANGUAGE_HEADER)

internal const val DEFAULT_POSTS_BATCH = 50
internal const val DEFAULT_POSTS_BATCH = 50

// Default timeout for establishing a connection: 5 seconds.
internal val DEFAULT_CONNECTION_TIMEOUT = Duration.ofSeconds(5)
// Default timeout for reading a response: 1 minute.
internal val DEFAULT_READ_TIMEOUT = Duration.ofMinutes(1)
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import kotlinx.coroutines.Dispatchers.IO
import kotlinx.coroutines.withContext
import ru.sokomishalov.skraper.client.HttpMethodType
import ru.sokomishalov.skraper.client.HttpMethodType.GET
import ru.sokomishalov.skraper.internal.consts.DEFAULT_CONNECTION_TIMEOUT
import ru.sokomishalov.skraper.internal.consts.DEFAULT_READ_TIMEOUT
import java.io.DataOutputStream
import java.net.HttpURLConnection
import java.net.HttpURLConnection.*
Expand Down Expand Up @@ -75,6 +77,6 @@ private fun HttpURLConnection.applyData(
doOutput = true
DataOutputStream(outputStream).use { wr -> wr.write(it) }
}
connectTimeout = 5_000
readTimeout = 5_000
connectTimeout = DEFAULT_CONNECTION_TIMEOUT.toMillis().toInt()
readTimeout = DEFAULT_READ_TIMEOUT.toMillis().toInt()
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
/*
* Copyright (c) 2019-present Mikhael Sokolov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ru.sokomishalov.skraper.provider.vimeo

import com.fasterxml.jackson.databind.JsonNode
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import org.jsoup.nodes.Document
import ru.sokomishalov.skraper.Skraper
import ru.sokomishalov.skraper.Skrapers
import ru.sokomishalov.skraper.client.*
import ru.sokomishalov.skraper.internal.consts.DEFAULT_HEADERS
import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_BATCH
import ru.sokomishalov.skraper.internal.iterable.emitBatch
import ru.sokomishalov.skraper.internal.jsoup.getMetaPropertyMap
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.internal.number.div
import ru.sokomishalov.skraper.internal.serialization.getByPath
import ru.sokomishalov.skraper.internal.serialization.getInt
import ru.sokomishalov.skraper.internal.serialization.getLong
import ru.sokomishalov.skraper.internal.serialization.getString
import ru.sokomishalov.skraper.model.*
import java.time.Duration
import java.time.Instant
import java.time.ZonedDateTime

/**
 * [Skraper] implementation for vimeo.com.
 *
 * Posts are read from the JSON API at [API_BASE_URL], authorized with a JWT requested from
 * `/_next/jwt` on [BASE_URL]. Page info is read from the meta properties of the HTML page.
 *
 * @property client HTTP client used for every request made by this skraper
 */
open class VimeoSkraper @JvmOverloads constructor(
    override val client: SkraperClient = Skrapers.client
) : Skraper {

    /**
     * Emits the posts found under [path], page by page, until the API returns an empty page.
     *
     * Paths starting with `categories` are served by the search endpoint, filtered by the category
     * (and the optional `subcategory` query parameter). Any other path is treated as a user page:
     * the user id is taken from the page's `al:ios:url` / `al:android:url` meta property and the
     * videos of the first profile section are listed.
     *
     * The flow completes without emitting anything when no JWT could be acquired.
     *
     * @param path path relative to [BASE_URL], e.g. `/username` or `/categories/animation/videos`
     * @return cold flow of posts, [DEFAULT_POSTS_BATCH] requested per page
     */
    override fun getPosts(path: String): Flow<Post> = flow {
        val document = getPage(path)

        // Every API call below needs the token, so there is nothing to emit without it.
        val jwt = acquireJwt() ?: return@flow

        val fetcher: suspend (Int) -> List<JsonNode> = { page ->
            if (path.removePrefix("/").startsWith("categories")) {
                val category = path.substringAfter("categories/").substringBefore("/")
                val subCategory = path.substringAfter("?", missingDelimiterValue = "").takeIf { it.isNotEmpty() }?.substringAfter("subcategory=")?.substringBefore("&")
                fetchSearchPosts(category, subCategory, jwt, page)
            } else {
                val properties = document.getMetaPropertyMap()
                val userId = (properties["al:ios:url"] ?: properties["al:android:url"])?.substringAfterLast("/").orEmpty()
                val uri = fetchDefaultSectionUri(userId, jwt)
                fetchSectionPosts(uri.orEmpty(), jwt, page)
            }
        }

        // The counter lives outside the loop so that `++page` actually advances to the next page;
        // declared inside the loop it would be reset to 1 on every iteration.
        var page = 1
        while (true) {
            val rawPosts = fetcher(page)

            if (rawPosts.isEmpty()) break

            emitBatch(rawPosts) {
                Post(
                    id = getString("clip.link")?.substringAfterLast("/").orEmpty(),
                    text = getString("clip.name").orEmpty(),
                    publishedAt = runCatching { ZonedDateTime.parse(getString("clip.created_time")).toEpochSecond().let { Instant.ofEpochSecond(it) } }.getOrNull(),
                    media = listOf(Video(
                        url = getString("clip.link").orEmpty(),
                        thumbnail = getByPath("clip.pictures.sizes")?.lastOrNull()?.getString("link")?.toImage(),
                        // Width/height are read from the selected download entry (`it`), not from the post node.
                        aspectRatio = getByPath("clip.download")?.lastOrNull()?.let { it.getInt("width") / it.getInt("height") },
                        duration = getLong("clip.duration")?.let { Duration.ofSeconds(it) }
                    ))
                )
            }

            ++page
        }
    }

    /**
     * Builds the page info for [path] from the `og:*` meta properties of the HTML page.
     *
     * @param path path relative to [BASE_URL]
     * @return page info whose nick is the last segment of `og:url`
     */
    override suspend fun getPageInfo(path: String): PageInfo? {
        val document = getPage(path)
        val properties = document.getMetaPropertyMap()

        return PageInfo(
            nick = properties["og:url"]?.substringAfterLast("/"),
            name = properties["og:title"],
            description = properties["og:description"].orEmpty(),
            avatar = properties["og:image"]?.toImage(),
        )
    }

    /**
     * @param url url to check
     * @return `true` when the host of [url] contains `vimeo.com`
     */
    override fun supports(url: String): Boolean {
        return "vimeo.com" in url.host
    }

    /**
     * Resolves a [Video] to a direct file url using the player config JSON.
     *
     * The config url is derived from the OpenGraph media url (query string dropped, `/config` appended).
     * Values already present on [media] win over the ones from the config; the url is replaced by the
     * last entry of `request.files.progressive` when there is one. Other media kinds are returned as is.
     *
     * @param media media to resolve
     * @return resolved copy of [media], or [media] itself when the config could not be fetched
     */
    override suspend fun resolve(media: Media): Media {
        return when (media) {
            is Video -> {
                val openGraphMedia = client.fetchOpenGraphMedia(media)
                val videoConfigUrl = openGraphMedia.url.substringBeforeLast("?") + "/config?default_to_hd=1"
                val configJson = client.fetchJson(HttpRequest(videoConfigUrl)) ?: return media

                with(configJson) {
                    media.copy(
                        url = getByPath("request.files.progressive")?.lastOrNull()?.getString("url") ?: media.url,
                        thumbnail = media.thumbnail ?: getString("video.thumbs.base")?.toImage(),
                        duration = media.duration ?: getLong("video.duration")?.let { Duration.ofSeconds(it) },
                        aspectRatio = media.aspectRatio ?: (getInt("video.width") / getInt("video.height")),
                    )
                }
            }

            else -> media
        }
    }

    /** Fetches the HTML document located at [path] relative to [BASE_URL]. */
    private suspend fun getPage(path: String): Document? = client.fetchDocument(
        request = HttpRequest(url = BASE_URL.buildFullURL(path = path))
    )

    /** Requests `/_next/jwt` on [BASE_URL] and returns the `token` field of the response, if any. */
    private suspend fun acquireJwt(): String? {
        return client.fetchJson(HttpRequest(
            url = BASE_URL.buildFullURL(path = "/_next/jwt"),
            headers = DEFAULT_HEADERS + mapOf(
                "Connection" to "keep-alive",
                "x-requested-with" to "XMLHttpRequest",
            )
        ))?.getString("token")
    }

    /** Returns the `uri` of the first entry of the profile sections listed for [userId], if any. */
    private suspend fun fetchDefaultSectionUri(userId: String, jwt: String): String? {
        return client.fetchJson(HttpRequest(
            url = API_BASE_URL.buildFullURL(path = "/users/${userId}/profile_sections"),
            headers = mapOf("Authorization" to "jwt $jwt")
        ))?.getString("data.0.uri")
    }

    /**
     * Fetches one page of videos of the profile section identified by [uri].
     *
     * @return the entries of the response's `data` node, or an empty list when it is absent
     */
    private suspend fun fetchSectionPosts(uri: String, jwt: String, page: Int): List<JsonNode> {
        return client.fetchJson(HttpRequest(
            url = API_BASE_URL.buildFullURL(
                path = "$uri/videos",
                queryParams = mapOf(
                    "page" to page,
                    "per_page" to DEFAULT_POSTS_BATCH
                )
            ),
            headers = mapOf(
                "Authorization" to "jwt $jwt",
                "Content-Type" to "application/json"
            )
        ))?.getByPath("data")?.toList().orEmpty()
    }

    /**
     * Fetches one page of clips from the search endpoint, filtered by [category] and, when given,
     * by [subCategory].
     *
     * @return the entries of the response's `data` node, or an empty list when it is absent
     */
    private suspend fun fetchSearchPosts(category: String, subCategory: String?, jwt: String, page: Int): List<JsonNode> {
        return client.fetchJson(HttpRequest(
            url = API_BASE_URL.buildFullURL(
                path = "/search",
                queryParams = mapOf(
                    "page" to page,
                    "per_page" to DEFAULT_POSTS_BATCH,
                    "direction" to "desc",
                    "_video_override" to "true",
                    "c" to "b",
                    "query" to "",
                    "filter_type" to "clip",
                    "filter_category" to category,
                ) + when {
                    subCategory != null -> mapOf("filter_subcategory" to subCategory)
                    else -> emptyMap()
                }
            ),
            headers = mapOf(
                "Authorization" to "jwt $jwt",
                "Content-Type" to "application/json"
            )
        ))?.getByPath("data")?.toList().orEmpty()
    }

    companion object {
        /** Base url of the website (HTML pages and the JWT endpoint). */
        const val BASE_URL: String = "https://vimeo.com"

        /** Base url of the JSON API. */
        const val API_BASE_URL: String = "https://api.vimeo.com/"
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2019-present Mikhael Sokolov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ru.sokomishalov.skraper.provider.vimeo

import kotlinx.coroutines.flow.Flow
import ru.sokomishalov.skraper.model.PageInfo
import ru.sokomishalov.skraper.model.Post


/**
* @author sokomishalov
*/

/**
 * Posts published on the page of the given user.
 *
 * @param username user's nick; requested as the path `/username`
 * @return flow of posts as produced by [VimeoSkraper.getPosts]
 */
fun VimeoSkraper.getUserPosts(username: String): Flow<Post> = getPosts(path = "/$username")

/**
 * Posts listed under a category, optionally narrowed down to one of its subcategories.
 *
 * @param category category name; requested as the path `/categories/<category>/videos`
 * @param subCategory optional subcategory, appended as the `subcategory` query parameter
 * @return flow of posts as produced by [VimeoSkraper.getPosts]
 */
fun VimeoSkraper.getCategoryPosts(category: String, subCategory: String? = null): Flow<Post> {
    val query = if (subCategory != null) "?subcategory=$subCategory" else ""
    return getPosts(path = "/categories/$category/videos$query")
}

/**
 * Page info of the given user.
 *
 * @param username user's nick; requested as the path `/username`
 * @return page info as produced by [VimeoSkraper.getPageInfo]
 */
suspend fun VimeoSkraper.getUserInfo(username: String): PageInfo? = getPageInfo(path = "/$username")
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ abstract class SkraperTck {
filename = UUID.randomUUID().toString()
)
}
}.onFailure {
log.info("Media download failed", it)
}.getOrNull()

assertNotNull(downloaded)
Expand All @@ -145,7 +147,7 @@ abstract class SkraperTck {
protected suspend fun <T> log(action: suspend Skraper.() -> T): T? {
return runCatching { skraper.action() }
.onSuccess { log.info(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(it)) }
.onFailure { throw AssertionError("Exception occured", it) }
.onFailure { throw AssertionError("Exception occurred", it) }
.getOrNull()
}
}
Loading

0 comments on commit fb307b9

Please sign in to comment.