Python celery plugin #125
Hi @tom-pytel, I missed this in this PR, but this makes `requests` a mandatory dependency of skywalking-python. Please also take a look at apache/skywalking#7282: `requests` depends on an LGPL-licensed dependency that we cannot ship with in an ASF project. As we already have this in `extras_require/http`, can we just remove it? When users want to use the `http` protocol, they can use something like `pip install skywalking-python[http]`. FYI @wu-sheng
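For illustration, a hedged sketch of the packaging shape being suggested here; the distribution name, dependency pins, and `setup()` layout are assumptions for the example, not the project's actual setup.py:

```python
# setup.py (illustrative excerpt only)
from setuptools import setup, find_packages

setup(
    name='apache-skywalking',          # assumed distribution name
    packages=find_packages(),
    install_requires=[
        'grpcio',                      # gRPC stays the default protocol
        'grpcio-tools',
    ],
    extras_require={
        # `requests` ships only when users opt in to the http protocol,
        # e.g. via the [http] extra mentioned above.
        'http': ['requests'],
    },
)
```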
Why does a plugin require an agent-level dependency?
We support the grpc and http protocols. For the http protocol we use `requests` to send HTTP requests; grpc is the default protocol and http is optional (it can be installed by `pip install skywalking-python[http]`). I think @tom-pytel missed that and wanted to test the http protocol, so he added the dependency here.
OK, got it. Good that we don't really depend on it.
Sure, if the license will cause problems then we can remove it from the required dependencies. I could also look into using a different communication method like `urllib.request` or `urllib3.request`?

As for the http protocol: we are doing stress testing here and finding that grpc is not entirely reliable, and the http protocol is actually a lot more stable. Not sure why this is happening; maybe grpc is not configured correctly or the timeouts are causing problems. But the main result is that you should consider the http protocol a little more than just optional at this point, since it is capable of working in scenarios for us where grpc breaks.
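To show what dropping `requests` could look like, here is a minimal standard-library sketch for POSTing a JSON payload with `urllib.request`; the collector URL and payload shape are placeholders, not the agent's actual wire format:

```python
import json
import urllib.request

def post_json(url: str, payload: dict, timeout: float = 5.0) -> int:
    """POST a JSON body using only the standard library (no `requests`)."""
    data = json.dumps(payload).encode('utf-8')
    req = urllib.request.Request(
        url,
        data=data,
        headers={'Content-Type': 'application/json'},
        method='POST',
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.status

# hypothetical usage -- endpoint is an assumption for illustration only
# status = post_json('http://127.0.0.1:12800/v3/segments', {'segments': []})
```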
@tom-pytel Could you share how you test the performance in a separate issue? From the last several weeks' perf tests, JSON really doesn't have good performance from the Java perspective, tested in the OAP backend.
Is it possible to make gRPC work across fork? Like I said in the DM, recreate a brand-new gRPC channel in the forked process?
I tried closing down the channel and recreating it in both parent and child after fork. It is possible I did not do it right since I am not a grpc expert, but I got one of two results:
I didn't mean to close the parent channel and create one in the child process by reusing the agent in the parent. What I meant is to start another independent agent in the child process and leave the parent one there, because there may be other things that need to be traced in the parent process. Can you take a look at `skywalking-python/skywalking/trace/ipc/process.py` (lines 30 to 34 in c733985) and see whether that helps? It is generally what I propose to do in forked processes.
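If I read that right, the idea is roughly the following sketch. This is illustrative only, not the contents of process.py, and it assumes `agent.start()` can be called again in a freshly forked child, which is exactly the open question in this thread:

```python
import os

from skywalking import agent  # public start() entry point of the agent

def run_child_with_new_agent(child_work):
    """Fork and start a brand-new, independent agent inside the child,
    leaving the parent's agent (threads, channel) untouched so the parent
    process keeps being traced."""
    pid = os.fork()
    if pid == 0:
        agent.start()      # fresh agent instance for this process only
        child_work()
        os._exit(0)
    return pid
```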
In your current implementation, when new processes are spawned, the agent in the parent process takes no effect then, right?
I tried several things like:

- Recreating `GrpcServiceManagementClient` and `GrpcTraceSegmentReportService` in the child.
- `close()`, use `unsubscribe()`.
- `unsubscribe()` then `close()`, before fork or after in the child.

I also forgot to mention there was a third result I was getting sometimes: a deadlock hang. It is possible I missed some permutations or a different function to call, but in general, researching Python grpc with multiprocessing on the net, I found basically the following answers: either 1. "don't do it", or 2. "grpc must be started only after forking everything, then maybe it will work". Here are some links:
googleapis/synthtool#902
https://stackoverflow.com/questions/62798507/why-multiprocess-python-grpc-server-do-not-work
So as I said, it may be possible but I have not hit on how to do it. If you want to give it a shot, I will add a simple test script to the end of this message. I also didn't test anything with Kafka and assume it will not work correctly when forking until someone validates that.
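For what it's worth, a minimal sketch of what answer 2 above describes (all gRPC objects created only in the child, after fork); the collector address is a placeholder:

```python
import os
import grpc  # grpcio

COLLECTOR = '127.0.0.1:11800'   # placeholder OAP gRPC address

def child_worker():
    # Create the channel *inside* the child, after fork, so no gRPC threads,
    # file descriptors, or internal locks are inherited from the parent.
    channel = grpc.insecure_channel(COLLECTOR)
    try:
        pass  # create stubs and report segments from here
    finally:
        channel.close()

pid = os.fork()
if pid == 0:
    child_worker()
    os._exit(0)
os.waitpid(pid, 0)
```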
As for the current higher-level flow, keep in mind it can be modified in the future according to what protocol is in use, but for now: nothing special is done before fork, or after fork in the parent. In those cases all threads, sockets, and locks continue operating as if nothing had happened. In the child, new report and heartbeat threads are started, since threads don't survive into children. And specifically for the http protocol, the duplicated sockets are closed and new ones are opened on the next heartbeat or report.
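Condensed into a sketch, using `os.register_at_fork` as the hook point; the helper names are illustrative placeholders, not the agent's real internals:

```python
import os
import threading

def _restart_report_and_heartbeat():
    # Placeholders for the agent's real report/heartbeat loops -- threads
    # never survive fork(), so they must be started again in the child.
    threading.Thread(target=lambda: None, name='report', daemon=True).start()
    threading.Thread(target=lambda: None, name='heartbeat', daemon=True).start()

def _drop_duplicated_http_sockets():
    # Placeholder: close sockets the child inherited from the parent; new
    # connections are opened lazily on the next heartbeat or report.
    pass

def _after_fork_in_child():
    _drop_duplicated_http_sockets()
    _restart_report_and_heartbeat()

# The parent needs nothing before or after fork; only the child re-initializes.
os.register_at_fork(after_in_child=_after_fork_in_child)
```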
There is a potential problem with the `__queue` object: a thread may have been holding an internal lock on it before fork, and since that thread is no longer present in the child, the queue will remain in a locked state. Not sure how to resolve this yet, but it should be a very rare event. Even rarer may be the same lock problem with the `__finished` event, but I wouldn't expect that to happen basically ever.

Right now I have other stuff on my plate, but if you have any suggestions on what to try I may revisit this at some point in the future. Or if you want to try it yourself, here is a test script:
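(The originally attached script is not preserved here; the following is only a stand-in for the same kind of test, and the `config.init` keyword and the `get_context` / `new_local_span` usage are assumptions about the agent's public API.)

```python
# Create spans before and after fork, in both parent and child, then check
# how many of them actually reach the collector.
import os
import time

from skywalking import agent, config
from skywalking.trace.context import get_context  # assumed import path

config.init(service='fork-test')   # keyword name is an assumption
agent.start()

with get_context().new_local_span(op='/parent/before-fork'):
    pass

pid = os.fork()
if pid == 0:                                   # child
    with get_context().new_local_span(op='/child/span-1'):
        pass
    with get_context().new_local_span(op='/child/span-2'):
        pass
    time.sleep(5)                              # let the report thread flush
    os._exit(0)

with get_context().new_local_span(op='/parent/after-fork'):
    pass
os.waitpid(pid, 0)
time.sleep(5)
```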
But also, the missing `failed to install plugin sw_celery` log line tells me you are not running this PR.
I was running on the master branch, not this PR.
Same result with or without a 2-second delay.
I have tried a few more times, with upstream/master, and still get bad results. I did get one run where I got all 4 spans, but the rest of the runs were 3 spans, with a couple of deadlocks. Apart from that, upstream/master cannot possibly run correctly in a multiprocessing scenario, because on fork() no other threads are duplicated in the child (like report or heartbeat); they need to be explicitly recreated in the fork child (which I do in this PR).

I don't have time allocated now to look into the grpc issue, but I do know that the http protocol in this PR works with fork() for sure. So how do you want to proceed? I could remove that warning message if you want, or change it to something a little less absolute like "fork() may not work correctly with the grpc protocol". But in general this PR does not change anything about how grpc worked before; it just fixes the http protocol and adds restart of the report and heartbeat threads in the fork() child. And also the celery plugin, of course.
BTW, this is not the end of the road though. Our internal stress tests show problems with spans mixing or disappearing, so I need to go back into the core functionality and fix all that. Maybe overhaul how span context is tracked, like in the Node agent (especially since async wasn't originally a design consideration in this agent). So treat this PR as a single step towards getting all that fixed.