diff --git a/Dockerfile b/Dockerfile
index b725dc9..3818fdf 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,30 +7,21 @@ COPY go.sum .
 RUN go mod download
 COPY . .

-RUN make build
+RUN go build -v -o ipfs-crawler cmd/ipfs-crawler/main.go

 FROM debian:bullseye-slim AS runner

-# Create a system user to drop into.
-RUN groupadd -r ipfs \
-  && useradd --no-log-init -r -g ipfs ipfs \
-  && mkdir -p ipfs
-
 # Enter our working directory.
 WORKDIR libp2p-crawler

 # Copy compiled binaries from builder.
-COPY --from=builder /usr/src/ipfs-crawler/cmd/ipfs-crawler/ipfs-crawler ./libp2p-crawler
+COPY --from=builder /usr/src/ipfs-crawler/ipfs-crawler ./libp2p-crawler
 COPY --from=builder /usr/src/ipfs-crawler/dist/docker_entrypoint.sh .
 COPY --from=builder /usr/src/ipfs-crawler/dist/config_ipfs.yaml ./config/config_ipfs.yaml
 COPY --from=builder /usr/src/ipfs-crawler/dist/config_filecoin_mainnet.yaml ./config/config_filecoin_mainnet.yaml

-# Set ownership.
-RUN chown -R ipfs:ipfs ./libp2p-crawler
-
-# Drop root.
-USER ipfs
+# Symlink the IPFS config so it is used by default.
+RUN ln -s ./config/config_ipfs.yaml config.yaml

 # Run the binary.
-ENTRYPOINT ["./docker_entrypoint.sh","--config","./config/config_ipfs.yaml"]
-
+ENTRYPOINT ["./docker_entrypoint.sh", "--config", "config.yaml"]
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 9e9a813..03ee2f2 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2020 Sebastian Henningsen
+Copyright (c) 2020-2023 Sebastian Henningsen & Leo Balduf

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 492143a..0000000
--- a/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-build:
-	go build cmd/ipfs-crawler/main.go
-	mv main cmd/ipfs-crawler/ipfs-crawler
-
-preimages:
-	go build cmd/hash-precomputation/main.go
-	mv main cmd/hash-precomputation/hash-precomputation
-	./cmd/hash-precomputation/hash-precomputation
-	mkdir -p precomputed_hashes
-	mv preimages.csv precomputed_hashes/preimages.csv
-
-clean:
-	rm cmd/ipfs-crawler/crawler
-
-all: preimages build
diff --git a/README.md b/README.md
index 4b45284..f957f83 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-# A Crawler for the Kademlia-part of the IPFS-network
+# Libp2p-Crawler
+
+A crawler for the Kademlia part of various libp2p networks.

 **For more details, see [our paper](https://arxiv.org/abs/2002.07747).**

@@ -14,27 +16,6 @@ Sebastian A. Henningsen, Sebastian Rust, Martin Florian, Björn Scheuermann:
 For a Live Version of the crawler results, check out our [Periodic Measurements of the IPFS Network](https://trudi.weizenbaum-institut.de/ipfs_analysis.html)

-## In a Nutshell
-
-This crawler is designed to enumerate all reachable nodes within the DHT/KAD-part of libp2p networks and return their neighborhood graph.
-For each node it saves
-* The ID
-* All known multiaddresses that were found in the DHT
-* If a connection could be established
-* All peers in the routing table of the peer, if crawling succeeded
-* The agent version, if the identify protocol succeeded
-* Supported protocols, if the identify protocol succeeded
-* Plugin-extensible metadata
-
-This is achieved by sending multiple `FindNode`-requests to each node in the network, targeted in such a way that each request extracts the contents of exactly one DHT bucket.
-
-The crawler is optimized for speed, to generate as accurate snapshots as possible.
-It starts from the (configurable) bootstrap nodes, polls their buckets and continues to connect to every peer it has not seen so far.
-
-For an in-depth dive and discussion to the crawler and the obtained results, you can watch @scriptkitty's talk at ProtocolLabs:
-
-[![Link to YouTube](https://img.youtube.com/vi/jQI37Y25jwk/1.jpg)](https://www.youtube.com/watch?v=jQI37Y25jwk)
-
 ## Building

 You can build this in a containerized environment.
@@ -43,27 +24,51 @@ This will build on Debian Bullseye and extract the compiled binary to `out/`:
 ./build-in-docker.sh
 ```

-## Run one or multiple crawls
+This is the preferred way to compile the crawler.
+You can also compile it manually.
+This requires an older version of Go, since the QUIC implementation usually does not yet support the most recent release.

-To run a single crawl simply do:
+## Usage
+To crawl the network once, execute the crawler with the corresponding config file:

 ```bash
-make build
-./start_crawl.sh
+export LIBP2P_ALLOW_WEAK_RSA_KEYS="" && export LIBP2P_SWARM_FD_LIMIT="10000" && ./out/libp2p-crawler --config dist/config_ipfs.yaml
 ```

-**Important note:** We ship the pre-images necessary for a successful crawl, but you can compute them yourself with `make preimages`.
-Note that the preimages only have to be computed *once*, it'll take some minutes, to compute them, though.
-
 One crawl will take 5-10 minutes, depending on your machine.

-For multiple crawls, use the `autocrawl.sh` script instead of `start_crawl.sh` in the last line. It takes a duration in days and an optional directory to put logs into.
-Note that there will be a lot of output on your disk, one week of crawling (without logs) can lead to 30-50GB of data!
-The complete workflow is:
+### Docker
+
+The image executes `dist/docker_entrypoint.sh` by default, which sets the environment variables and launches the crawler with all arguments provided to it.
+This loads a config file located at `/libp2p-crawler/config.yaml` in the image.
+You can thus override the executed config by mounting a different file to this location.
+
+You'll need to mount the precomputed hashes as well as an output directory.
+The working directory of the container is `/libp2p-crawler`.
+A typical invocation could look like this:
+
+```bash
+docker run -it --rm \
+  -v ./dist/config_ipfs.yaml:/libp2p-crawler/config.yaml \
+  -v ./precomputed_hashes:/libp2p-crawler/precomputed_hashes \
+  -v ./output_data_crawls:/libp2p-crawler/output_data_crawls \
+  trudi-group/ipfs-crawler:latest
+```
+
+The crawler runs as `root` within the container and thus writes files as UID `0`.
+This is inconvenient on the host, since files in the mapped output directory will also be owned by `root`.
+
+### Computing Preimages
+
+**Important note:** We ship the pre-images necessary for a successful crawl, but you can also compute them yourself:
+Note that the preimages only have to be computed *once*; this takes a few minutes.

 ```bash
-make build
-./autocrawl [-l logdir]
+go build cmd/hash-precomputation/main.go
+mv main cmd/hash-precomputation/hash-precomputation
+./cmd/hash-precomputation/hash-precomputation
+mkdir -p precomputed_hashes
+mv preimages.csv precomputed_hashes/preimages.csv
 ```

 ## Configuration

@@ -73,6 +78,33 @@ Example configurations with sane defaults are provided in [dist/](dist):

 - [dist/config_ipfs.yaml](dist/config_ipfs.yaml) contains a configuration to crawl the IPFS network.
 - [dist/config_filecoin_mainnet.yaml](dist/config_filecoin_mainnet.yaml) contains a configuration to crawl the Filecoin mainnet.

+### Bootstrap Peers
+
+The crawler needs to know which peers to use to start a crawl.
+These are configured via the configuration file.
+To get the default bootstrap peers of an IPFS node, simply run `./ipfs bootstrap list > bootstrappeers.txt`.
+
+## In a Nutshell
+
+This crawler is designed to enumerate all reachable nodes within the DHT/KAD-part of libp2p networks and return their neighborhood graph.
+For each node, it saves:
+* The ID
+* All known multiaddresses that were found in the DHT
+* Whether a connection could be established
+* All peers in the routing table of the peer, if crawling succeeded
+* The agent version, if the identify protocol succeeded
+* Supported protocols, if the identify protocol succeeded
+* Plugin-extensible metadata
+
+This is achieved by sending multiple `FindNode`-requests to each node in the network, targeted in such a way that each request extracts the contents of exactly one DHT bucket.
+
+The crawler is optimized for speed, to generate snapshots that are as accurate as possible.
+It starts from the (configurable) bootstrap nodes, polls their buckets, and continues to connect to every peer it has not seen so far.
+
+For an in-depth dive into the crawler and a discussion of the obtained results, you can watch @scriptkitty's talk at Protocol Labs:
+
+[![Link to YouTube](https://img.youtube.com/vi/jQI37Y25jwk/1.jpg)](https://www.youtube.com/watch?v=jQI37Y25jwk)
+
 ## Evaluation of Results

 After running a few crawls, the output directory should have some data in it.
@@ -99,12 +131,6 @@ The next crawl will then not only start at the boot nodes but also add all previ
 This can increase the crawl speed, and therefore the accuracy of the snapshots, significantly.
 Due to node churn, this setting is most reasonable when performing many consecutive crawls.

-### Sanity Check ("Canary Peers")
-
-The crawler enumerates the nodes in the network, but without ground truth it is hard to assess the quality and completeness of a crawl.
-Therefore, it might be desirable to check whether some known IPFS-nodes appear in the crawl.
-This functionality used to exist in the crawler, but we believe it is more convenient to check externally using common UNIX tools.
-
 ## Output of a crawl

 A crawl writes two files to the output directory configured via the configuration file:
@@ -207,12 +233,6 @@ which says that the peer with ID `12D3KooWD9QV2...` had an entry for peer `12D3K
 If `target_crawlable` is `false`, this indicates that the crawler was not able to connect to or enumerate all of `target`'s peers.
 Since some nodes reside behind NATs or are otherwise uncooperative, this is not uncommon to see.

-## Bootstrap Peers
-
-The crawler needs to know which peers to use to start a crawl.
-These are configured via the configuration file.
-To get the default bootstrap peers of an IPFS node, simply run ```./ipfs bootstrap list > bootstrappeers.txt```.
-
 ## Libp2p complains about key lengths

 Libp2p uses a minimum key length of [2048 bit](https://github.com/libp2p/go-libp2p-core/blob/master/crypto/rsa_common.go), whereas IPFS uses [512 bit](https://github.com/ipfs/infra/issues/378).
@@ -231,4 +251,8 @@ Please raise the maximum number of sockets on linux via
 ```bash
 ulimit -n unlimited
 ```
-or equivalent commands on different platforms
+or equivalent commands on different platforms.
+
+## License
+
+MIT, see [LICENSE](LICENSE).
\ No newline at end of file
diff --git a/autocrawl.sh b/autocrawl.sh
deleted file mode 100755
index 229ba49..0000000
--- a/autocrawl.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env bash
-
-# USAGE: ./autocrawl.sh [-l <logdir>] <duration>
-# Duration is given in days, convert to seconds
-usage="$0 [-l <logdir>] <duration> \n
-  -l: log directory *that has to exist*. If not specified, logs are written to /dev/null"
-
-logdir=""
-
-case $1 in
-  -l ) shift
-       logdir=$1
-       if [[ ! -d $logdir ]]; then
-         echo "$dir is not a directory, will not keep logs there."
-         exit 1
-       fi
-       echo "Outputting logs to $logdir."
-       shift
-       ;;
-esac
-
-if [[ "$logdir" == "" ]]; then
-  echo "Not keeping logs."
-fi
-
-if [[ "$1" == "" ]]; then
-  echo "No duration given."
-  echo -e $usage
-  exit 1
-fi
-
-if ! [[ "$1" =~ ^[0-9]+$ ]]
-  then
-  echo "Duration must be integer"
-  exit 1
-fi
-
-duration=$1
-
-secondDuration=$duration*24*3600
-startTime=$(date +%s)
-endTime=$((startTime+secondDuration))
-
-counter=1
-
-echo -e "Started crawling at $(date --date=@+$startTime).\nWill crawl until $(date --date=@+$endTime)."
-
-while [[ $(date +%s) -le $endTime ]]
-do
-  echo "Crawl no. $counter"
-  if [[ "$logdir" == "" ]]; then
-    ./start_crawl.sh 2> /dev/null
-  else
-    ./start_crawl.sh 2> $logdir/crawl_log_"$(date --rfc-3339='seconds')"_$counter
-  fi
-  ((counter++))
-done
\ No newline at end of file
diff --git a/cmd/ipfs-crawler/main.go b/cmd/ipfs-crawler/main.go
index 43e4721..c0da2b4 100644
--- a/cmd/ipfs-crawler/main.go
+++ b/cmd/ipfs-crawler/main.go
@@ -34,9 +34,9 @@ func main() {
 	var configFilePath string
 	var help bool

-	flag.BoolVar(&debug, "debug", false, "whether to enable debug logging")
+	flag.BoolVar(&debug, "debug", false, "enable debug logging")
 	flag.StringVar(&configFilePath, "config", "dist/config_ipfs.yaml", "path to the configuration file")
-	flag.BoolVar(&help, "help", false, "Print usage.")
+	flag.BoolVar(&help, "help", false, "print usage")
 	flag.Parse()

 	if help {
diff --git a/dist/docker_entrypoint.sh b/dist/docker_entrypoint.sh
index cfa739c..7bd2d80 100644
--- a/dist/docker_entrypoint.sh
+++ b/dist/docker_entrypoint.sh
@@ -1,3 +1,3 @@
 #!/bin/bash -e

-export LIBP2P_ALLOW_WEAK_RSA_KEYS="" && export LIBP2P_SWARM_FD_LIMIT="10000" && ./libp2p-crawler \$@
+export LIBP2P_ALLOW_WEAK_RSA_KEYS="" && export LIBP2P_SWARM_FD_LIMIT="10000" && ./libp2p-crawler "$@"
diff --git a/start_crawl.sh b/start_crawl.sh
deleted file mode 100644
index 0a00a00..0000000
--- a/start_crawl.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash -e
-
-export LIBP2P_ALLOW_WEAK_RSA_KEYS="" && export LIBP2P_SWARM_FD_LIMIT="10000" && ./cmd/ipfs-crawler/ipfs-crawler \$@
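
A usage sketch for the config-override mechanism introduced above: since the image now resolves `/libp2p-crawler/config.yaml` (a symlink to the shipped IPFS config) by default, a Filecoin crawl only requires mounting the bundled Filecoin config over that path. The image tag is an assumption carried over from the README example (`trudi-group/ipfs-crawler:latest`); substitute whatever tag your local build uses.

```bash
# Sketch: crawl Filecoin mainnet instead of IPFS by overriding the default config.
# Assumes the image is tagged trudi-group/ipfs-crawler:latest, as in the README example.
docker run -it --rm \
  -v ./dist/config_filecoin_mainnet.yaml:/libp2p-crawler/config.yaml \
  -v ./precomputed_hashes:/libp2p-crawler/precomputed_hashes \
  -v ./output_data_crawls:/libp2p-crawler/output_data_crawls \
  trudi-group/ipfs-crawler:latest
```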
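
The same crawl without Docker, as a sketch assuming the binary was built to `out/` via `./build-in-docker.sh` (mirroring the README's IPFS invocation, with only the config path swapped):

```bash
# Sketch: manual Filecoin crawl with the locally built binary.
export LIBP2P_ALLOW_WEAK_RSA_KEYS=""
export LIBP2P_SWARM_FD_LIMIT="10000"
./out/libp2p-crawler --config dist/config_filecoin_mainnet.yaml
```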