diff --git a/docs/design/opstate.puml b/docs/design/opstate.puml new file mode 100644 index 000000000..eb6be73d4 --- /dev/null +++ b/docs/design/opstate.puml @@ -0,0 +1,61 @@ +@startuml +scale 600 width +[*] --> TrackedOp : track() + + TrackedOp: setCycle(cycle) + TrackedOp: setWaitTime(delay) + TrackedOp: start() + TrackedOp: + TrackedOp: setData(data) + TrackedOp: getData() + TrackedOp: skip(reason) + +State InProtocol { + + TrackedOp --> StartedOp : start() + + StartedOp: getCycle() + StartedOp: setData(data) + StartedOp: getData() + StartedOp: succeed(status) + StartedOp: + StartedOp: retry() + StartedOp: fail(status) + StartedOp: getStartedAtNanos() + StartedOp: getCurrentServiceTimeNanos() + StartedOp: getCurrentResponseTimeNanos() + + StartedOp -> StartedOp : retry() + + StartedOp --> SucceededOp : succeed() + SucceededOp: getCycle() + SucceededOp: getResult() + SucceededOp: getTries() + SucceededOp: getStartedAtNanos() + SucceededOp: getServiceTimeNanos() + SucceededOp: getResponseTimeNanos() + + StartedOp --> FailedOp: fail() + FailedOp: getCycle() + FailedOp: getResult() + FailedOp: getTries() + FailedOp: getStartedAtNanos() + FailedOp: getServiceTimeNanos() + FailedOp: getResponseTimeNanos() +} + +TrackedOp --> SkippedOp : skip() +SkippedOp: getSkippedReason() +SkippedOp: +SkippedOp: getCycle() +SkippedOp: getResult() +SkippedOp: getStartedAtNanos() +SkippedOp: getData() +SkippedOp: setData(data) + + +SucceededOp --> [*] +FailedOp --> [*] +SkippedOp --> [*] + +@enduml \ No newline at end of file diff --git a/docs/eb_iterates_cycles.puml b/docs/eb_iterates_cycles.puml new file mode 100644 index 000000000..15d22bfca --- /dev/null +++ b/docs/eb_iterates_cycles.puml @@ -0,0 +1,29 @@ +@startuml + +Participant Input as i +Participant Thread as t +Participant Action as a + +== acquire input == + +group TIMER read-input + t -> i : get segment(stride) + activate i + t <- i : [stride] + deactivate i +end + +group TIMER strides + + loop over cycle values in segment 
+ group TIMER cycle & phase + t -> a : runCycle(cycle) + activate a + t <- a : result + deactivate a + end +end + +end # strides + +@enduml \ No newline at end of file diff --git a/docs/eb_iterates_cycles.svg b/docs/eb_iterates_cycles.svg new file mode 100644 index 000000000..3e7530892 --- /dev/null +++ b/docs/eb_iterates_cycles.svg @@ -0,0 +1,183 @@ + + + + + + + + + + + + + + + + + + + + + + Input + + + Input + + + Thread + + + Thread + + + Action + + + Action + + + + + + + + acquire input + + + + TIMER read-input + + + + get segment(stride) + + + + <cycle segment>[stride] + + + + TIMER strides + + + + loop + + [over cycle values in segment] + + + + TIMER cycle & phase + + + + runCycle(cycle) + + + + result + + + \ No newline at end of file diff --git a/docs/eb_iterates_phases.puml b/docs/eb_iterates_phases.puml new file mode 100644 index 000000000..ff3d1c82a --- /dev/null +++ b/docs/eb_iterates_phases.puml @@ -0,0 +1,42 @@ +@startuml + +Participant Input as i +Participant Thread as t +Participant Action as a + +== acquire input data == + +group TIMER read-input + t -> i : get segment(stride) + activate i + t <- i : [stride] + deactivate i +end + +group TIMER strides + + loop over cycle values in segment + group TIMER cycle + group TIMER phase + t -> a : runCycle(cycle) + activate a + t <- a : result + deactivate a + end + +== additional phases == + +group TIMER phase + loop until phases complete + t -> a : runPhase(cycle) + activate a + t <- a : result + deactivate a +end +end +end +end + +end # strides + +@enduml \ No newline at end of file diff --git a/docs/eb_iterates_phases.svg b/docs/eb_iterates_phases.svg new file mode 100644 index 000000000..91e0cebe9 --- /dev/null +++ b/docs/eb_iterates_phases.svg @@ -0,0 +1,55 @@ +InputInputThreadThreadActionActionacquire input dataTIMER read-inputget segment(stride)<cycle segment>[stride]TIMER stridesloop[over cycle values in segment]TIMER cycleTIMER phaserunCycle(cycle)resultadditional phasesTIMER phaseloop[until 
phases complete]runPhase(cycle)result \ No newline at end of file diff --git a/docs/eb_latency_details.puml b/docs/eb_latency_details.puml new file mode 100644 index 000000000..9081a4928 --- /dev/null +++ b/docs/eb_latency_details.puml @@ -0,0 +1,38 @@ +@startuml +Participant user as u +Participant client as c +Participant resource as cr +Participant transport as t +Participant server as s + +group responsetime + u -> c: request + activate c #Black + group waittime + c -> cr: wait + activate cr #Yellow + note left of cr: client\nwaits\nfor\nresource + cr -> c: + deactivate cr + end + + group servicetime + c ->> t: request + activate t #Red + group servertime + t ->> s: request + deactivate t + activate s #Blue + note right of s: server\nprocesses\nrequest + s ->> t: response + deactivate s + activate t #Red + end + t ->> c: response + deactivate t + end + c -> u: response + deactivate c +end + +@enduml \ No newline at end of file diff --git a/docs/eb_latency_terms.puml b/docs/eb_latency_terms.puml new file mode 100644 index 000000000..c7f6d336a --- /dev/null +++ b/docs/eb_latency_terms.puml @@ -0,0 +1,30 @@ +@startuml +Participant user as u +Participant client as c +Participant resource as cr +Participant server as s + +group responsetime + u -> c: request + activate c #Black +' note left of c: user\nwaits\nfor\nresponse + group waittime + c -> cr: wait + activate cr #Yellow + note right of cr: client\nwaits\nfor\nresource + cr -> c: + deactivate cr + end + + group servicetime + c ->> s: request + activate s #Blue + note right of s: server\nprocesses\nrequest + s ->> c: response + deactivate s + end + c -> u: response + deactivate c +end + +@enduml \ No newline at end of file diff --git a/docs/eb_latency_terms.svg b/docs/eb_latency_terms.svg new file mode 100644 index 000000000..4890a91f2 --- /dev/null +++ b/docs/eb_latency_terms.svg @@ -0,0 +1,42 @@ 
+useruserclientclientresourceresourceserverserverresponsetimerequestwaittimewaitclientwaitsforresourceservicetimerequestserverprocessesrequestresponseresponse \ No newline at end of file diff --git a/docs/hybrid_ratelimiter.md b/docs/hybrid_ratelimiter.md new file mode 100644 index 000000000..ff6c7c8a7 --- /dev/null +++ b/docs/hybrid_ratelimiter.md @@ -0,0 +1,175 @@ +## RateLimiter Design + +The nosqlbench rate limiter is based on a hybrid design, combining ideas +from well-known algorithms with a heavy dose of mechanical sympathy. The +resulting implementation provides the following: + +1. A basic design that can be explained in one page (this page!) +2. High throughput, compared to other rate limiters tested. +3. Graceful degradation with increasing concurrency. +4. Clearly defined behavioral semantics. +5. Efficient burst capability, for tunable catch-up rates. +6. Efficient calculation of wait time. + +## Parameters + +**op rate** - In simplest terms, users simply need to configure the *op rate*. +For example, `cyclerate=12000` specifies an op rate of 12000 per second. + +**burst rate** - Additionally, users may specify a burst rate which can be used +to recover unused time when a client is able to go faster than the strict +limit. The burst rate is multiplied by the _op rate_ to arrive at the maximum +rate when wait time is available to recover. For example, `cyclerate=12000,1.1` +specifies that a client may operate at 12000 ops/s _when it is caught up_, +while allowing it to go at a rate of up to 13200 ops/s _when it is behind +schedule_. + +## Design Principles + +The core design of the rate limiter is based on the [token +bucket](https://en.wikipedia.org/wiki/Token_bucket) algorithm as established in +the telecom industry for rate metering. Additional refinements have been +added to allow for flexible and reliable use on non-realtime systems. + +The unit of scheduling used in this design is the token, corresponding directly +to a nanosecond of time. 
The schedule time that is made available to callers is +stored in a pool of tokens which is set to a configured size. The size of the +token pool determines how many grants are allowed to be dispatched before the +next one is forced to wait for available tokens. + +At some regular frequency, a filler thread adds tokens (nanoseconds of time to +be distributed to waiting ops) to the pool. The callers which are waiting for +these tokens consume a number of tokens serially. If the pool does not contain +the requested number of tokens, then the caller is blocked using basic +synchronization primitives. When the pool is filled any blocked callers are +unblocked. + +## Design Details + +In fact, there are three pools. The _active_ pool, the _bursting_ pool, and the +_waiting_ pool. The active pool has a limited size based on the number of +operations that are allowed to be granted concurrently. + +The bursting pool is sized according to the relative burst rate and the size of the +active pool. For example, with an op rate of 1000 ops/s and a burst rate of 1.1, +the active pool can be sized to 1E9 nanos (one second of nanos), and the burst +pool can be sized to 1E8 (1/10 of that), thus yielding a combined pool size of +1E9 + 1E8, or 1100000000 ns. + +The waiting pool is where all extra tokens are held in reserve. It is unlimited +except by the size of a long value. The size of the waiting pool is a direct +measure of wait time in nanoseconds. + +Within the pools, no tokens are created nor destroyed. They are added by the +filler based on the passage of time, and consumed by callers when they become +available. In between these operations, the net sum of tokens is preserved. + +The filler thread adds tokens to the pool according to the system real-time +clock, at some estimated but unreliable interval. 
The frequency of filling is set +high enough to give a reliable perception of time passing smoothly, but low +enough to avoid wasting too much thread time in calling overhead. (It is set to +1K/s by default). Each time filling occurs, the real-time clock is +check-pointed, and the time delta is fed into the pool filling logic as +explained below. + +![RateLimiterDesign](hybrid_ratelimiter_sketch.png) + +During pool filling, the following steps are taken: +1) tokens are added to the active pool until it is full. +2) Any extra tokens are added to the waiting pool. +3) If the waiting pool has any tokens, and there is room in the bursting pool, + some tokens are moved from the waiting pool to the bursting pool. + +When a caller asks for a number of tokens, the combined total from the active +and burst pools is available to that caller. + +## Bursting Logic + +Tokens in the waiting pool represent time that has not been claimed by a caller. +Tokens accumulate in the waiting pool as a side-effect of continuous filling +outpacing continuous draining, thus creating a backlog of operations. The pool +sizes determine both the maximum instantaneously available operations as well as +the rate at which unclaimed time can be back-filled back into the active or +burst pools. + +### Normalizing for Jitter + +Since it is not possible to schedule the filler thread to trigger on a strict +and reliable schedule (as in a real-time system), the method of moving tokens +from the waiting pool to the bursting pool must account for differences in +timing. Thus, tokens which are activated for bursting are scaled according to +the amount of time added in the last fill, relative to the maximum active pool. +This means that a full pool fill will allow a full burst pool fill, presuming +wait time is positive by that amount. It also means that the same effect can be +achieved by ten consecutive fills of a tenth the time each. 
In effect, bursting +is normalized to the passage of time and the burst rate, with a maximum cap +imposed when operations are unclaimed by callers. + +## Mechanical Trade-offs + +In this implementation, it is relatively easy to explain how accuracy and +performance trade off. They are competing concerns. Consider these two extremes +of an isochronous configuration: + +### Slow Isochronous + +For example, the rate limiter could be configured for strict isochronous +behavior by setting the active pool size to *one* op of nanos and the burst rate +to 1.0, thus disabling bursting. If the op rate requested is 1 op/s, this +configuration will work relatively well, although *any* caller which doesn't +show up (or isn't already waiting) when the tokens become available will incur a +waittime penalty. The odds of this are relatively low for a high-velocity +client. + +### Fast Isochronous + +However, if the op rate for this type of configuration is set to 1E8 operations +per second, then the filler thread will be adding 100,000 ops worth of time when +there is only *one* op worth of active pool space. This is due to the fact that +filling can only occur at a maximal frequency which has been set to 1K fills/s +on average. That will create artificial wait time, since the token consumers and +producers would not have enough pool space to hold the tokens needed during +fill. It is not possible on most systems to fill the pool at arbitrarily high +fill frequencies. Thus, it is important for users to understand the limits of +the machinery when using high rates. In most scenarios, these limits will not be +onerous. + +### Boundary Rules + +Taking these effects into account, the default configuration makes some +reasonable trade-offs according to the rules below. These rules should work well +for most rates below 50M ops/s. The net effect of these rules is to increase +work bulking within the token pools as rates go higher. 
+ +Trying to go above 50M ops/s while also forcing isochronous behavior will result +in artificial wait time. For this reason, the pool size itself is not +user-configurable at this time. + +- The pool size will always be at least as big as two ops. + This rule ensures that there is adequate buffer + space for tokens when callers are accessing the token pools near the rate of + the filler thread. If this were not ensured, then artificial wait time would + be injected due to actual overflow error. +- The pool size will always be at least as big as 1E6 nanos, or 1/1000 of a second. + This rule ensures that the filler thread has a reasonably attainable + update frequency which will prevent underflow in the active or burst pools. +- The number of ops that can fit in the pool will determine how many ops + can be dispatched between fills. For example, an op rate of 1E6 will mean + that up to 1000 ops worth of tokens may be present between fills, and + up to 1000 ops may be allowed to start at any time before the next fill. + +In practical terms, this means that rates slower than 1K ops/s will have their +strictness controlled by the burst rate in general, and rates faster than 1K +ops/s will automatically include some op bulking between fills. + +## History + +A CAS-oriented method which compensated for RTC calling overhead was used +previously. This method afforded very high performance, but it was difficult to +reason about. + +This implementation replaces that previous version. Basic synchronization +primitives (implicit locking via synchronized methods) performed surprisingly +well -- well enough to discard the complexity of the previous implementation. + +Further, this version is much easier to study and reason about. 
\ No newline at end of file diff --git a/docs/hybrid_ratelimiter.svg b/docs/hybrid_ratelimiter.svg new file mode 100644 index 000000000..143f9f89c --- /dev/null +++ b/docs/hybrid_ratelimiter.svg @@ -0,0 +1,1286 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + active pool waitingpool burstpool overflow refill consume + backfill 1 + 2 + 3 + 4 + + diff --git a/docs/hybrid_ratelimiter_sketch.png b/docs/hybrid_ratelimiter_sketch.png new file mode 100644 index 000000000..6f5d91c01 Binary files /dev/null and b/docs/hybrid_ratelimiter_sketch.png differ diff --git a/docs/hybrid_ratelimiter_sketch.svg b/docs/hybrid_ratelimiter_sketch.svg new file mode 100644 index 000000000..972ae1c05 --- /dev/null +++ b/docs/hybrid_ratelimiter_sketch.svg @@ -0,0 +1,578 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + active + burst + wait + + + + + + 1 + 2 + 3 + refill + + available + + + + + + in-reserve + + diff --git a/docs/hybrid_ratelimiter_sketch2.png b/docs/hybrid_ratelimiter_sketch2.png new file mode 100644 index 000000000..b9eb40d32 Binary files /dev/null and b/docs/hybrid_ratelimiter_sketch2.png differ diff --git a/docs/op_state_nomnoml.png b/docs/op_state_nomnoml.png new file mode 100644 index 000000000..fbeea86f5 Binary files /dev/null and b/docs/op_state_nomnoml.png differ diff --git a/docs/opstate_nomnoml.svg b/docs/opstate_nomnoml.svg new file mode 100644 index 000000000..2fed2a9a8 --- /dev/null +++ b/docs/opstate_nomnoml.svg @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + +track +start +skip +retry +succeed +fail + +tracked + +started + +succeeded + +failed + +skipped \ No newline at end of file diff --git a/docs/parts.puml b/docs/parts.puml new file 
mode 100644 index 000000000..e69de29bb diff --git a/docs/rate_limiter_design.puml b/docs/rate_limiter_design.puml new file mode 100644 index 000000000..8a97ff202 --- /dev/null +++ b/docs/rate_limiter_design.puml @@ -0,0 +1,82 @@ +@startuml + +Participant "Calling\nThread" as t +Participant "Limiter\nlogic" as l +Participant "Allocated\nnanos" as a +Participant "Elapsed\nnanos" as e +Participant "Clock\nSource" as c + +t -> l : acquire(nanos) + +group allocate start time +l -> a : getAndIncrement(nanos) +activate a #black +note over l,a + **allocated** is an atomic accumulator + which represents scheduled time. Each + op causes it to be atomically incremented + by a time slice of nanos. +end note +a -> l : +deactivate a +end + +group calculate delay (cached) +l -> e : get() +activate e +note over e + **elapsed** is an + atomic register + which caches + system time. +end note +e -> l : +deactivate e +l -> l : delay = \nelapsed - scheduled_at + +note right + **delay** measures external delay + that causes an op to fire after + the ideal time. **positive delay** + thus means the rate limiter doesn't + need to impose its own blocking delay + in order to ensure delay>=0. +end note + +end + +group if delay<0 (cached) +note over l,c + If delay<0, then this operation is too soon according + to the cached clock value. Since this could be stale + and cause us to block needlessly, we update the cached + clock value and recompute delay. 
+end note +l -> c : get() (~25ns) +activate c #orange +c -> l : +deactivate c + +l -> e : store() +activate e #black +e -> l +deactivate e +l -> l : delay = \nelapsed - scheduled_at + +group if delay<0 (updated) + l->l: sleep(-delay);\ndelay=0 + note right + If delay is negative, we sleep + in the calling thread and + set delay=0 + end note + activate l + deactivate l +end + +end + +l->t: + + +@enduml \ No newline at end of file diff --git a/docs/ratelimiter_timelines.puml b/docs/ratelimiter_timelines.puml new file mode 100644 index 000000000..649d74a43 --- /dev/null +++ b/docs/ratelimiter_timelines.puml @@ -0,0 +1,47 @@ +@startuml +scale 100 as 100 pixels + +Title Rate Limiter - **Timelines** + +robust "typical" as W1 +@W1 +0 is past +100 is allocated #yellow +200 is scheduled #lightgreen +632 is now #lightblue +W1@100 <-> @200: schedule\ndelay +W1@200 <-> @632: + +robust "no waittime" as W2 +@W2 +0 is past +200 is allocated #yellow +200.000001 is scheduled #lightgreen +632 is now #lightblue + +robust "caughtup" as W3 +@W3 +0 is past +100 is allocated #yellow +200.000001 is scheduled #lightgreen +232 is now #lightblue + +robust "ahead" as W4 +@W4 +0 is past +100 is allocated #yellow +200.000001 is scheduled #lightgreen +232 is now #lightblue + +concise "perfect ops" as O +@O +0 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op +@enduml diff --git a/docs/research/multivariate.svg b/docs/research/multivariate.svg new file mode 100644 index 000000000..b08df1c38 --- /dev/null +++ b/docs/research/multivariate.svg @@ -0,0 +1,533 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + SCORE + value + throughputlatency + PARAMS + + + + + + + + + + + + + RESULT + ccphasyncthreads + + + PARAMS + + + + + VALUE + + + + + + + + resultfunctionR(P) + valuefunctionV(R) + + V(R(P)) + + diff --git a/docs/research/op_tracking.svg b/docs/research/op_tracking.svg new 
file mode 100644 index 000000000..1b11ddb02 --- /dev/null +++ b/docs/research/op_tracking.svg @@ -0,0 +1,607 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + optracker + + + + + + + + + + scenario + + + + + + + activity + + + + input + + + + asynccount + + + + stridetracker + + + + cyclesegment + + + + + + + thread + + + diff --git a/docs/testseq_early.png b/docs/testseq_early.png new file mode 100644 index 000000000..df14d6361 Binary files /dev/null and b/docs/testseq_early.png differ diff --git a/docs/testseq_early.puml b/docs/testseq_early.puml new file mode 100644 index 000000000..a3bc77d93 --- /dev/null +++ b/docs/testseq_early.puml @@ -0,0 +1,45 @@ +@startuml +scale 100 as 100 pixels + +Title Rate Limit - **EARLY** +concise "clock" as C +concise "elapsed" as E +concise "scheduled" as S +concise "allocated" as A + +C is the_past #red +E is elapsed #lightblue +S is scheduled #orange +A is allocated #yellow + +@C +732 is future #white +E->C + +@E +500 is unseen #white +@500 <-> @732: **error** = 232 +A -> C + +@S +0 is idle #grey +100 is scheduled #orange +600 is unscheduled #white +@500 <-> @600: **scheduling_delay** =\nelapsed - scheduled = -100 + +@A +300 is unallocated #white +@300 <-> @500: **wait_time** =\nelapsed - allocated = 200 + +concise "Ops" as O +@O +0 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op +@enduml diff --git a/docs/testseq_late.png b/docs/testseq_late.png new file mode 100644 index 000000000..95c894b08 Binary files /dev/null and b/docs/testseq_late.png differ diff --git a/docs/testseq_late.puml b/docs/testseq_late.puml new file mode 100644 index 000000000..f6f17e5a4 --- /dev/null +++ b/docs/testseq_late.puml @@ -0,0 +1,51 @@ +@startuml +scale 100 as 100 pixels + +Title Rate Limit **LATE** +concise "clock" as C +concise "elapsed" as E +concise "scheduled" as S 
+concise "allocated" as A + +C is the_past #red +E is NOWTIME #lightblue +S is scheduled #orange +A is allocated #yellow + +@0 +S is idle #grey + +@100 +A is unallocated #white +S is scheduled #orange + +@200 +S is unscheduled #white + +@500 +E is unseen #white +A -> C + +@632 +C is future #white +E->C + +@A +@100 <-> @500: **wait_time** =\nelapsed - allocated = 400 +@E +@500 <-> @632: **error** = 132 +@S +@200 <-> @500: **scheduling_delay** =\nelapsed - scheduled = 300 + +concise "Ops" as O +@O +0 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op ++100 is op +@enduml diff --git a/docs/todo.md b/docs/todo.md new file mode 100644 index 000000000..eae441c79 --- /dev/null +++ b/docs/todo.md @@ -0,0 +1,11 @@ +- convert core input to be equivalent of `input=type:interval,cycles:N[..M]` +- Add doc support for input, input filters, outputs, output filters +- Build metadata scaffolding for parameters, so unused parameters may be warned about. + - parameters should be part of the activity API + - parameters should not re-trigger def observers for non-change evhandler + - parameters across all subsystems should be discoverable or enumerable +- make stride auto-sizing uniformly apply after sequencing +- reimplement core activity and scenario logic as async/reactor with monitors +- convert to Java 9 +- add activity library commands +- add --list-input-filters and --list-output-filters \ No newline at end of file