update graph wheels to graph workload

Jonathan Shook 2022-01-05 15:32:12 -06:00
parent b12e2afa66
commit d54e75ce63
30 changed files with 2982 additions and 41 deletions

View File

@ -0,0 +1,12 @@
# Baselines Version 1
To avoid changing these workloads and possibly impacting
results without warning, the baseline sets are kept
in separate directories.
## Status
This directory is for baselines version 1. The files in this directory
should not be modified arbitrarily. They need to remain stable so that
comparisons to previous results based on these workloads are still valid.

View File

@ -0,0 +1,107 @@
# nb -v run driver=cql yaml=cql-iot tags=phase:schema host=dsehost
description: An IoT workload with settings optimized for DSE
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
bindings:
machine_id: Mod(<<sources:10000>>); ToHashedUUID() -> java.util.UUID
sensor_name: HashedLineToString('data/variable_words.txt')
time: Mul(<<timespeed:100>>L); Div(<<sources:10000>>L); ToDate()
cell_timestamp: Mul(<<timespeed:100>>L); Div(<<sources:10000>>L); Mul(1000L)
sensor_value: Normal(0.0,5.0); Add(100.0) -> double
station_id: Div(<<sources:10000>>);Mod(<<stations:100>>); ToHashedUUID() -> java.util.UUID
data: HashedFileExtractToString('data/lorem_ipsum_full.txt',800,1200)
blocks:
- tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table : |
create table if not exists <<keyspace:baselines>>.<<table:iot>> (
machine_id UUID, // source machine
sensor_name text, // sensor name
time timestamp, // timestamp of collection
sensor_value double, //
station_id UUID, // source location
data text,
PRIMARY KEY ((machine_id, sensor_name), time)
) WITH CLUSTERING ORDER BY (time DESC)
AND compression = { 'sstable_compression' : '<<compression:LZ4Compressor>>' }
AND nodesync={'enabled': 'true'}
AND compaction = {
'class': 'TimeWindowCompactionStrategy',
'compaction_window_size': <<expiry_minutes:60>>,
'compaction_window_unit': 'MINUTES',
'split_during_flush': true
};
tags:
name: create-table
- truncate-table: |
truncate table <<keyspace:baselines>>.<<table:iot>>;
tags:
name: truncate-table
- tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- insert-rampup: |
insert into <<keyspace:baselines>>.<<table:iot>>
(machine_id, sensor_name, time, sensor_value, station_id, data)
values ({machine_id}, {sensor_name}, {time}, {sensor_value}, {station_id}, {data})
using timestamp {cell_timestamp}
idempotent: true
tags:
name: insert-rampup
- tags:
phase: verify
type: read
params:
ratio: 1
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- select-verify: |
select * from <<keyspace:baselines>>.<<table:iot>>
where machine_id={machine_id} and sensor_name={sensor_name} and time={time};
verify-fields: "*, -cell_timestamp"
tags:
name: select-verify
- tags:
phase: main
type: read
params:
ratio: <<read_ratio:1>>
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- select-read: |
select * from <<keyspace:baselines>>.<<table:iot>>
where machine_id={machine_id} and sensor_name={sensor_name}
limit <<limit:10>>
tags:
name: select-read
- tags:
phase: main
type: write
params:
ratio: <<write_ratio:9>>
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- insert-main: |
insert into <<keyspace:baselines>>.<<table:iot>>
(machine_id, sensor_name, time, sensor_value, station_id, data)
values ({machine_id}, {sensor_name}, {time}, {sensor_value}, {station_id}, {data})
using timestamp {cell_timestamp}
idempotent: true
tags:
name: insert-main

View File

@ -0,0 +1,93 @@
---
title: CQL IoT
weight: 2
---
# CQL IoT
## Description
The CQL IoT workload demonstrates a time-series telemetry system as typically found in IoT applications. The bulk of the
traffic is telemetry ingest. This is useful for establishing steady-state capacity with an actively managed data
lifecycle. This is a steady-state workload, where inserts are 90% of the operations and queries are the remaining 10%.
## Named Scenarios
### default
The default scenario for cql-iot.yaml runs the conventional test phases: schema, rampup, main
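As a sketch of a typical invocation (the host and cycle counts here are hypothetical), the named scenario can be run as a whole, or step by step, with template parameters supplied on the command line:

```
# run the full default scenario:
# nb cql-iot rampup-cycles=10000000 main-cycles=10000000 host=myhost
# or run a single named step of it:
# nb cql-iot default.rampup rampup-cycles=1000000 host=myhost
```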
## Testing Considerations
For in-depth testing, this workload will take some time to build up data density where TTLs begin purging expired data.
At this point, the test should be considered steady-state.
## Data Set
### baselines.iot dataset (rampup,main)
- machine_id - 1000 unique values
- sensor_name - 100 symbolic names, from a seed file
- time - monotonically increasing timestamp
- station_id - 100 unique values
- sensor_value - normal distribution, median 100, stddev 5.0
## Operations
### insert (rampup, main)
insert into baselines.iot
(machine_id, sensor_name, time, sensor_value, station_id)
values (?,?,?,?,?)
### query (main)
select * from baselines.iot
where machine_id=? and sensor_name=?
limit 10
## Workload Parameters
This workload has no adjustable parameters when used in the baseline tests.
When used for additional testing, the following parameters should be supported:
- machines - the number of unique sources (default: 1000)
- stations - the number of unique stations (default: 100)
- limit - the limit for rows in reads (default: 10)
- expiry_minutes - the TTL for data in minutes.
- compression - enabled or disabled; to disable, set compression=''
- write_cl - the consistency level for writes (default: LOCAL_QUORUM)
- read_cl - the consistency level for reads (default: LOCAL_QUORUM)
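For example (a sketch with hypothetical values and host), these template parameters can be supplied on the command line when launching a phase:

```
# nb run driver=cql yaml=cql-iot tags=phase:main expiry_minutes=120 limit=20 \
#   write_cl=LOCAL_ONE read_cl=LOCAL_ONE host=myhost
```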
## Key Performance Metrics
Client-side metrics are a more accurate measure of the system behavior from a user's perspective. For microbench and
baseline tests, these are the only required metrics. When gathering metrics from multiple server nodes, they should be
kept in aggregate form, as min, max, and average for each time interval in monitoring. For example, the avg p99 latency
for reads should be kept, as well as the min p99 latency for reads. If possible, metrics should be kept in plot form,
with discrete histogram values per interval.
### Client-Side
- read ops/s
- write ops/s
- read latency histograms
- write latency histograms
- exception counts
### Server-Side
- bytes compacted over time
- pending compactions
- active data on disk
- total data on disk
## Notes on Interpretation
- In order for this test to show useful performance contrasts, it has to be ramped to steady-state.
- Ingest of 1G rows yields an on-disk data density of 20.8 GB using default compression settings.

View File

@ -0,0 +1,140 @@
# nb -v run driver=cql yaml=cql-iot tags=phase:schema host=dsehost
description: |
Time-series data model and access patterns. (use cql-timeseries instead)
This is the same as cql-timeseries, which is the preferred name as it is
more canonical. This workload is retained for historical reasons.
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
astra:
schema: run driver=cql tags==phase:schema-astra threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
params:
instrument: TEMPLATE(instrument,false)
bindings:
machine_id: Mod(<<sources:10000>>); ToHashedUUID() -> java.util.UUID
sensor_name: HashedLineToString('data/variable_words.txt')
time: Mul(<<timespeed:100>>L); Div(<<sources:10000>>L); ToDate()
cell_timestamp: Mul(<<timespeed:100>>L); Div(<<sources:10000>>L); Mul(1000L)
sensor_value: Normal(0.0,5.0); Add(100.0) -> double
station_id: Div(<<sources:10000>>);Mod(<<stations:100>>); ToHashedUUID() -> java.util.UUID
data: HashedFileExtractToString('data/lorem_ipsum_full.txt',800,1200)
blocks:
- tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table : |
create table if not exists <<keyspace:baselines>>.<<table:iot>> (
machine_id UUID, // source machine
sensor_name text, // sensor name
time timestamp, // timestamp of collection
sensor_value double, //
station_id UUID, // source location
data text,
PRIMARY KEY ((machine_id, sensor_name), time)
) WITH CLUSTERING ORDER BY (time DESC)
AND compression = { 'sstable_compression' : '<<compression:LZ4Compressor>>' }
AND compaction = {
'class': 'TimeWindowCompactionStrategy',
'compaction_window_size': <<expiry_minutes:60>>,
'compaction_window_unit': 'MINUTES'
};
tags:
name: create-table
- truncate-table: |
truncate table <<keyspace:baselines>>.<<table:iot>>;
tags:
name: truncate-table
- tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table-astra : |
create table if not exists <<keyspace:baselines>>.<<table:iot>> (
machine_id UUID, // source machine
sensor_name text, // sensor name
time timestamp, // timestamp of collection
sensor_value double, //
station_id UUID, // source location
data text,
PRIMARY KEY ((machine_id, sensor_name), time)
) WITH CLUSTERING ORDER BY (time DESC);
tags:
name: create-table-astra
- tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- insert-rampup: |
insert into <<keyspace:baselines>>.<<table:iot>>
(machine_id, sensor_name, time, sensor_value, station_id, data)
values ({machine_id}, {sensor_name}, {time}, {sensor_value}, {station_id}, {data})
using timestamp {cell_timestamp}
idempotent: true
tags:
name: insert-rampup
params:
instrument: TEMPLATE(instrument-writes,TEMPLATE(instrument,false))
- tags:
phase: verify
type: read
params:
ratio: 1
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- select-verify: |
select * from <<keyspace:baselines>>.<<table:iot>>
where machine_id={machine_id} and sensor_name={sensor_name} and time={time};
verify-fields: "*, -cell_timestamp"
tags:
name: select-verify
params:
instrument: TEMPLATE(instrument-reads,TEMPLATE(instrument,false))
- tags:
phase: main
type: read
params:
ratio: <<read_ratio:1>>
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- select-read: |
select * from <<keyspace:baselines>>.<<table:iot>>
where machine_id={machine_id} and sensor_name={sensor_name}
limit <<limit:10>>
tags:
name: select-read
params:
instrument: TEMPLATE(instrument-reads,TEMPLATE(instrument,false))
- tags:
phase: main
type: write
params:
ratio: <<write_ratio:9>>
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- insert-main: |
insert into <<keyspace:baselines>>.<<table:iot>>
(machine_id, sensor_name, time, sensor_value, station_id, data)
values ({machine_id}, {sensor_name}, {time}, {sensor_value}, {station_id}, {data})
using timestamp {cell_timestamp}
idempotent: true
tags:
name: insert-main
params:
instrument: TEMPLATE(instrument-writes,TEMPLATE(instrument,false))

View File

@ -0,0 +1,77 @@
---
title: CQL Key-Value
weight: 1
---
## Description
The CQL Key-Value workload demonstrates the simplest possible schema with payload data. This is useful for measuring
system capacity most directly in terms of raw operations. As a reference point, it provides some insight into the types
of workloads that are constrained by messaging, threading, and tasking, rather than by bulk throughput.
During preload, all keys are set with a value. During the main phase of the workload, random keys from the known
population are selected for upsert, with new values that never repeat.
## Operations
### insert (rampup, main)
insert into baselines.keyvalue (key, value) values (?,?);
### read (main)
select * from baselines.keyvalue where key=?key;
## Data Set
### baselines.keyvalue insert (rampup)
- key - text, number as string, selected sequentially up to keycount
- value - text, number as string, selected sequentially up to valuecount
### baselines.keyvalue insert (main)
- key - text, number as string, selected uniformly within keycount
- value - text, number as string, selected uniformly within valuecount
### baselines.keyvalue read (main)
- key - text, number as string, selected uniformly within keycount
## Workload Parameters
This workload has no adjustable parameters when used in the baseline tests.
When used for additional testing, the following parameters should be supported:
- keycount - the number of unique keys
- valuecount - the number of unique values
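For example (a sketch; the workload file name and values are hypothetical):

```
# nb run driver=cql yaml=cql-keyvalue tags=phase:rampup keycount=10000000 valuecount=10000000
```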
## Key Performance Metrics
Client-side metrics are a more accurate measure of the system behavior from a user's perspective. For microbench and
baseline tests, these are the only required metrics. When gathering metrics from multiple server nodes, they should be
kept in aggregate form, as min, max, and average for each time interval in monitoring. For example, the avg p99 latency
for reads should be kept, as well as the min p99 latency for reads. If possible, metrics should be kept in plot form,
with discrete histogram values per interval.
### Client-Side
- read ops/s
- write ops/s
- read latency histograms
- write latency histograms
- exception counts
### Server-Side
- pending compactions
- bytes compacted
- active data on disk
- total data on disk
## Notes on Interpretation
Once the average ratio of overwrites starts to balance with the rate of compaction, a steady state should be achieved.
At this point, pending compactions and bytes compacted should be mostly flat over time.

View File

@ -0,0 +1,102 @@
description: A workload with only text keys and text values
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
astra:
schema: run driver=cql tags==phase:schema-astra threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
bindings:
seq_key: Mod(<<keycount:1000000000>>); ToString() -> String
seq_value: Hash(); Mod(<<valuecount:1000000000>>); ToString() -> String
rw_key: <<keydist:Uniform(0,1000000000)->int>>; ToString() -> String
rw_value: Hash(); <<valdist:Uniform(0,1000000000)->int>>; ToString() -> String
blocks:
- name: schema
tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:keyvalue>> (
key text,
value text,
PRIMARY KEY (key)
);
tags:
name: create-table
- name: schema-astra
tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:keyvalue>> (
key text,
value text,
PRIMARY KEY (key)
);
tags:
name: create-table-astra
- name: rampup
tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- rampup-insert: |
insert into <<keyspace:baselines>>.<<table:keyvalue>>
(key, value)
values ({seq_key},{seq_value});
tags:
name: rampup-insert
- name: verify
tags:
phase: verify
type: read
params:
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- verify-select: |
select * from <<keyspace:baselines>>.<<table:keyvalue>> where key={seq_key};
verify-fields: key->seq_key, value->seq_value
tags:
name: verify
- name: main-read
tags:
phase: main
type: read
params:
ratio: 5
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- main-select: |
select * from <<keyspace:baselines>>.<<table:keyvalue>> where key={rw_key};
tags:
name: main-select
- name: main-write
tags:
phase: main
type: write
params:
ratio: 5
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- main-insert: |
insert into <<keyspace:baselines>>.<<table:keyvalue>>
(key, value) values ({rw_key}, {rw_value});
tags:
name: main-insert

View File

@ -0,0 +1,112 @@
description: A tabular workload with partitions, clusters, and data fields
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
astra:
schema: run driver=cql tags==phase:schema-astra threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
bindings:
# for ramp-up and verify
part_layout: Div(<<partsize:1000000>>); ToString() -> String
clust_layout: Mod(<<partsize:1000000>>); ToString() -> String
data: HashedFileExtractToString('data/lorem_ipsum_full.txt',50,150)
# for read
limit: Uniform(1,10) -> int
part_read: Uniform(0,<<partcount:100>>)->int; ToString() -> String
clust_read: Add(1); Uniform(0,<<partsize:1000000>>)->int; ToString() -> String
# for write
part_write: Hash(); Uniform(0,<<partcount:100>>)->int; ToString() -> String
clust_write: Hash(); Add(1); Uniform(0,<<partsize:1000000>>)->int; ToString() -> String
data_write: Hash(); HashedFileExtractToString('data/lorem_ipsum_full.txt',50,150) -> String
blocks:
- name: schema
tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:tabular>> (
part text,
clust text,
data text,
PRIMARY KEY (part,clust)
);
tags:
name: create-table
- name: schema-astra
tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:tabular>> (
part text,
clust text,
data text,
PRIMARY KEY (part,clust)
);
tags:
name: create-table-astra
- name: rampup
tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- rampup-insert: |
insert into <<keyspace:baselines>>.<<table:tabular>>
(part,clust,data)
values ({part_layout},{clust_layout},{data})
tags:
name: rampup-insert
- name: verify
tags:
phase: verify
type: read
params:
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- verify-select: |
select * from <<keyspace:baselines>>.<<table:tabular>> where part={part_layout} and clust={clust_layout}
tags:
name: verify-select
- name: main-read
tags:
phase: main
type: read
params:
ratio: 5
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- main-select: |
select * from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select
- name: main-write
tags:
phase: main
type: write
params:
ratio: 5
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- main-write: |
insert into <<keyspace:baselines>>.<<table:tabular>>
(part, clust, data)
values ({part_write},{clust_write},{data_write})
tags:
name: main-write

View File

@ -0,0 +1,86 @@
---
title: CQL Wide Rows
weight: 3
---
## Description
The CQL Wide Rows workload provides a way to tax a system with wide rows of a given size. This is useful to help
understand underlying performance differences between versions and configuration options when using data models that
have wide rows.
For in-depth testing, this workload needs significant density of partitions in combination with fully populated wide
rows. For exploratory or parameter contrasting tests, ensure that the rampup phase is configured correctly to establish
this initial state.
## Data Set
### baselines.widerows dataset (rampup)
- part - text, number in string form, sequentially from 1..1E9
- clust - text, number in string form, sequentially from 1..1E9
- data - text, extract from lorem ipsum between 50 and 150 characters
### baselines.widerows dataset (main)
- part - text, number in string form, sequentially from 1..1E9
- clust - text, number in string form, sequentially from 1..<partsize>
- data - text, extract from lorem ipsum between 50 and 150 characters
## Operations
### insert (rampup, main)
insert into baselines.widerows
(part, clust, data)
values (?,?,?)
### query (main)
select * from baselines.widerows
where part=?
limit 10
## Workload Parameters
This workload has no adjustable parameters when used in the baseline tests.
When used for additional testing, the following parameters should be supported:
- partcount - the number of unique partitions
- partsize - the number of logical rows within a CQL partition
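For example (a sketch; the workload file name and values are hypothetical):

```
# nb run driver=cql yaml=cql-widerows tags=phase:rampup partcount=100 partsize=1000000
```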
## Key Performance Metrics
Client-side metrics are a more accurate measure of the system behavior from a user's perspective. For microbench and
baseline tests, these are the only required metrics. When gathering metrics from multiple server nodes, they should be
kept in aggregate form, as min, max, and average for each time interval in monitoring. For example, the avg p99 latency
for reads should be kept, as well as the min p99 latency for reads. If possible, metrics should be kept in plot form,
with discrete histogram values per interval.
### Client-Side
- read ops/s
- write ops/s
- read latency histograms
- write latency histograms
- exception counts
### Server-Side
- bytes compacted over time
- pending compactions
- active data on disk
- total data on disk
## Notes on Interpretation

View File

@ -0,0 +1,121 @@
description: |
This is a workload which creates an incrementally growing dataset over cycles.
Rows will be added incrementally in both rampup and main phases. However, during
the main phase, reads will also occur at the same rate, with the read patterns
selecting from the size of data written up to that point.
In order to ensure that the reads and writes operate against the same set of
identifiers, it is crucial that the ratios are not adjusted unless the binding
recipes are adjusted to match. With write:read ratio of 1:1 and a prefix function
Div(2L) at the front of the main phase bindings, the writes and reads will address
the same rows rather than playing leap-frog on the cycle values.
The main phase can be run without the rampup phase for this workload, as long
as your test is defined as an incremental write and read test. If you need
background data pre-loaded to ensure realistic read times against pre-indexed
data, then you may use the rampup phase before the main phase. However, be aware
that these are simply different test definitions, and are both valid in different ways.
Due to how this workload is meant to be used, you must specify main-cycles= when
invoking the main phase.
The cycles value for the main test includes operations for both writes and reads,
thus the logical number of rows in the dataset will be effectively half of that.
This workload is intended to be run with a sufficiently high number of cycles.
Two key details should be obvious in the read latency metrics -- 1) the relationship
between dataset size, request rate, and response times and 2) inflection points
between any hot and cold access modes for LRU or other caching mechanisms as
the primary cache layer is saturated.
scenarios:
default:
schema: run tags=phase:schema threads==1
# rampup: run tags=phase:rampup cycles===TEMPLATE(rampup-cycles,100000) threads=auto
main: run tags=phase:main cycles===TEMPLATE(main-cycles,0) threads=auto
default-schema: run tags=phase:schema threads==1
# default-rampup: run tags=phase:rampup cycles===TEMPLATE(rampup-cycles,100000) threads=auto
default-main: run tags=phase:main cycles===TEMPLATE(main-cycles,0) threads=auto
astra:
schema: run tags=phase:schema-astra threads==1
# rampup: run tags=phase:rampup cycles===TEMPLATE(rampup-cycles,0) threads=auto
main: run tags=phase:main cycles===TEMPLATE(main-cycles,0) threads=auto
params:
instrument: true
bindings:
rampup_key: ToString()
rampup_value: Hash(); ToString();
read_key: Div(2L); HashRangeScaled(TEMPLATE(scalefactor,1.0d)); ToString();
read_value: Div(2L); HashRangeScaled(TEMPLATE(scalefactor,1.0d)); Hash(); ToString();
write_key: Div(2L); Hash(); HashRangeScaled(TEMPLATE(scalefactor,1.0d)); ToString();
write_value: Div(2L); Hash(); HashRangeScaled(TEMPLATE(scalefactor,1.0d)); Hash(); ToString();
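# A worked sketch of the description above: with a 1:1 write:read ratio,
# Div(2L) maps each adjacent cycle pair (2N, 2N+1) to the same value N, so
# the read op addresses the same identifier as its paired write op rather
# than leap-frogging it. HashRangeScaled then selects within the range of
# values produced up to that point, scaled by the 'scalefactor' template
# parameter.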
blocks:
- name: schema
tags:
phase: schema
statements:
- create-keyspace: |
create keyspace if not exists TEMPLATE(keyspace,baselines)
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 'TEMPLATE(rf,1)'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table: |
create table if not exists TEMPLATE(keyspace,baselines).TEMPLATE(table,incremental) (
key text,
value text,
PRIMARY KEY (key)
);
- name: schema-astra
tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table: |
create table if not exists TEMPLATE(keyspace,baselines).TEMPLATE(table,incremental) (
key text,
value text,
PRIMARY KEY (key)
);
tags:
name: create-table-astra
- name: rampup
tags:
phase: rampup
params:
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
statements:
- rampup-insert: |
insert into TEMPLATE(keyspace,baselines).TEMPLATE(table,incremental)
(key, value)
values ({rampup_key},{rampup_value});
tags:
name: rampup-insert
- name: main-read
tags:
phase: main
type: read
params:
ratio: 1
cl: TEMPLATE(read_cl,LOCAL_QUORUM)
statements:
- main-select: |
select * from TEMPLATE(keyspace,baselines).TEMPLATE(table,incremental) where key={read_key};
tags:
name: main-select
- name: main-write
tags:
phase: main
type: write
params:
ratio: 1
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
statements:
- main-insert: |
insert into TEMPLATE(keyspace,baselines).TEMPLATE(table,incremental)
(key, value) values ({write_key}, {write_value});
tags:
name: main-insert

View File

@ -0,0 +1,18 @@
# Baselines Version 2
To avoid changing these workloads and possibly impacting
results without warning, the baseline sets are kept
in separate directories.
## Status
This directory is for baselines version 2. These files are the current
in-development set of baselines, and may change in minor ways, or have
additional workloads added, for example. If you are performing baselines
over a period of time and need the workloads to be perfectly stable,
it is best to copy these to your test assets under a distinct name and
call them from there.
To further disambiguate the workloads, each one has a version '2'
appended to the filename.
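A minimal sketch of the copy-and-rename approach described above (the file and target names are hypothetical):

```
# copy a v2 workload to a privately named asset, then invoke it directly:
# cp cql-keyvalue2.yaml myteam-keyvalue-baseline.yaml
# nb run driver=cql yaml=myteam-keyvalue-baseline.yaml tags=phase:schema
```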

View File

@ -0,0 +1,109 @@
description: |
A workload with only text keys and text values.
The CQL Key-Value workload demonstrates the simplest possible schema with payload data. This is useful for measuring
system capacity most directly in terms of raw operations. As a reference point, it provides some insight into the types
of workloads that are constrained by messaging, threading, and tasking, rather than by bulk throughput.
During preload, all keys are set with a value. During the main phase of the workload, random keys from the known
population are selected for upsert, with new values that never repeat.
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
astra:
schema: run driver=cql tags==phase:schema-astra threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
bindings:
seq_key: Mod(<<keycount:1000000000>>); ToString() -> String
seq_value: Hash(); Mod(<<valuecount:1000000000>>); ToString() -> String
rw_key: <<keydist:Uniform(0,1000000000)->int>>; ToString() -> String
rw_value: Hash(); <<valdist:Uniform(0,1000000000)->int>>; ToString() -> String
blocks:
- name: schema
tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:keyvalue>> (
key text,
value text,
PRIMARY KEY (key)
);
tags:
name: create-table
- name: schema-astra
tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:keyvalue>> (
key text,
value text,
PRIMARY KEY (key)
);
tags:
name: create-table-astra
- name: rampup
tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- rampup-insert: |
insert into <<keyspace:baselines>>.<<table:keyvalue>>
(key, value)
values ({seq_key},{seq_value});
tags:
name: rampup-insert
- name: verify
tags:
phase: verify
type: read
params:
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- verify-select: |
select * from <<keyspace:baselines>>.<<table:keyvalue>> where key={seq_key};
verify-fields: key->seq_key, value->seq_value
tags:
name: verify
- name: main-read
tags:
phase: main
type: read
params:
ratio: 5
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- main-select: |
select * from <<keyspace:baselines>>.<<table:keyvalue>> where key={rw_key};
tags:
name: main-select
- name: main-write
tags:
phase: main
type: write
params:
ratio: 5
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- main-insert: |
insert into <<keyspace:baselines>>.<<table:keyvalue>>
(key, value) values ({rw_key}, {rw_value});
tags:
name: main-insert

View File

@ -0,0 +1,176 @@
description: |
A tabular workload with partitions, clusters, and data fields
This workload contains partitioning and clustering columns along with a set
of 8 fields of varying length. The field values vary in size according
to the fibonacci sequence times a base size factor of 10, with
an additional 10% variance for each field.
The read patterns have a variety of field subsets specified.
During rampup, all rows will be written partition by partition,
filling in all rows of that partition before moving on to the next.
Example: With a partition size of 1000 and 1B rows, there will be
1000000 partitions.
During the main phase, the read patterns are varied with different
field sets. As well, the number of rows returned
is varied between 1 and 10.
By default, reads occur at the same ratio as writes, with main
phase writes writing full rows.
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10B) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,100M) threads=auto
astra:
schema: run driver=cql tags==phase:schema-astra threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
params:
instrument: true
bindings:
# for ramp-up and verify phases
#
part_layout: Div(<<partsize:1000>>); ToString() -> String
clust_layout: Mod(<<partsize:1000>>); ToString() -> String
# todo: update these definitions to use the simpler 10,0.1, 20, 0.2, ...
data0: Add(10); HashedFileExtractToString('data/lorem_ipsum_full.txt',9,11)
data1: Add(20); HashedFileExtractToString('data/lorem_ipsum_full.txt',18,22)
data2: Add(30); HashedFileExtractToString('data/lorem_ipsum_full.txt',27,33)
data3: Add(40); HashedFileExtractToString('data/lorem_ipsum_full.txt',45,55)
data4: Add(50); HashedFileExtractToString('data/lorem_ipsum_full.txt',72,88)
data5: Add(60); HashedFileExtractToString('data/lorem_ipsum_full.txt',107,143)
data6: Add(70); HashedFileExtractToString('data/lorem_ipsum_full.txt',189,231)
data7: Add(80); HashedFileExtractToString('data/lorem_ipsum_full.txt',306,374)
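# A sketch of the sizing above: field midpoints of roughly 10, 20, 30, 50,
# 80, 125, 210, and 340 characters, i.e. a Fibonacci-like progression times
# a base size of 10, each with about +/-10% variance around its midpoint.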
# for main phase
# for write
part_write: Hash(); Uniform(0,<<partcount:100>>)->int; ToString() -> String
clust_write: Hash(); Add(1); Uniform(0,<<partsize:1000000>>)->int; ToString() -> String
data_write: Hash(); HashedFileExtractToString('data/lorem_ipsum_full.txt',50,150) -> String
# for read
limit: Uniform(1,10) -> int
part_read: Uniform(0,<<partcount:100>>)->int; ToString() -> String
clust_read: Add(1); Uniform(0,<<partsize:1000000>>)->int; ToString() -> String
blocks:
- name: schema
tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:tabular>> (
part text,
clust text,
data0 text, data1 text, data2 text, data3 text,
data4 text, data5 text, data6 text, data7 text,
PRIMARY KEY (part,clust)
);
tags:
name: create-table
- name: schema-astra
tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table: |
create table if not exists <<keyspace:baselines>>.<<table:tabular>> (
part text,
clust text,
data0 text, data1 text, data2 text, data3 text,
data4 text, data5 text, data6 text, data7 text,
PRIMARY KEY (part,clust)
);
tags:
name: create-table-astra
- name: rampup
tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- rampup-insert: |
insert into <<keyspace:baselines>>.<<table:tabular>>
(part,clust,data0,data1,data2,data3,data4,data5,data6,data7)
values ({part_layout},{clust_layout},{data0},{data1},{data2},{data3},{data4},{data5},{data6},{data7})
tags:
name: rampup-insert
- name: verify
tags:
phase: verify
type: read
params:
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- verify-select: |
select * from <<keyspace:baselines>>.<<table:tabular>> where part={part_layout} and clust={clust_layout}
tags:
name: verify-select
- name: main-read
tags:
phase: main
type: read
params:
ratio: 1
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- main-select-all: |
select * from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-all
- main-select-01: |
select data0,data1 from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-01
- main-select-0246: |
select data0,data2,data4,data6 from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-0246
- main-select-1357: |
select data1,data3,data5,data7 from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-1357
- main-select-0123: |
select data0,data1,data2,data3 from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-0123
- main-select-4567: |
select data4,data5,data6,data7 from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-4567
- main-select-01234567: |
select data0,data1,data2,data3,data4,data5,data6,data7 from <<keyspace:baselines>>.<<table:tabular>> where part={part_read} limit {limit};
tags:
name: main-select-01234567
- name: main-write
tags:
phase: main
type: write
params:
ratio: 8
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- main-write: |
insert into <<keyspace:baselines>>.<<table:tabular>>
(part, clust, data0,data1,data2,data3,data4,data5,data6,data7)
values ({part_write},{clust_write},{data0},{data1},{data2},{data3},{data4},{data5},{data6},{data7})
tags:
name: main-write

View File

@ -0,0 +1,138 @@
# nb -v run driver=cql yaml=cql-iot tags=phase:schema host=dsehost
description: |
This workload emulates a time-series data model and access patterns.
scenarios:
default:
schema: run driver=cql tags==phase:schema threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
astra:
schema: run driver=cql tags==phase:schema-astra threads==1 cycles==UNDEF
rampup: run driver=cql tags==phase:rampup cycles===TEMPLATE(rampup-cycles,10000000) threads=auto
main: run driver=cql tags==phase:main cycles===TEMPLATE(main-cycles,10000000) threads=auto
params:
instrument: TEMPLATE(instrument,false)
bindings:
machine_id: Mod(<<sources:10000>>); ToHashedUUID() -> java.util.UUID
sensor_name: HashedLineToString('data/variable_words.txt')
time: Mul(<<timespeed:100>>L); Div(<<sources:10000>>L); ToDate()
cell_timestamp: Mul(<<timespeed:100>>L); Div(<<sources:10000>>L); Mul(1000L)
sensor_value: Normal(0.0,5.0); Add(100.0) -> double
station_id: Div(<<sources:10000>>);Mod(<<stations:100>>); ToHashedUUID() -> java.util.UUID
data: HashedFileExtractToString('data/lorem_ipsum_full.txt',800,1200)
blocks:
- tags:
phase: schema
params:
prepared: false
statements:
- create-keyspace: |
create keyspace if not exists <<keyspace:baselines>>
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '<<rf:1>>'}
AND durable_writes = true;
tags:
name: create-keyspace
- create-table : |
create table if not exists <<keyspace:baselines>>.<<table:iot>> (
machine_id UUID, // source machine
sensor_name text, // sensor name
time timestamp, // timestamp of collection
sensor_value double, //
station_id UUID, // source location
data text,
PRIMARY KEY ((machine_id, sensor_name), time)
) WITH CLUSTERING ORDER BY (time DESC)
AND compression = { 'sstable_compression' : '<<compression:LZ4Compressor>>' }
AND compaction = {
'class': 'TimeWindowCompactionStrategy',
'compaction_window_size': <<expiry_minutes:60>>,
'compaction_window_unit': 'MINUTES'
};
tags:
name: create-table
- truncate-table: |
truncate table <<keyspace:baselines>>.<<table:iot>>;
tags:
name: truncate-table
- tags:
phase: schema-astra
params:
prepared: false
statements:
- create-table-astra : |
create table if not exists <<keyspace:baselines>>.<<table:iot>> (
machine_id UUID, // source machine
sensor_name text, // sensor name
time timestamp, // timestamp of collection
sensor_value double, //
station_id UUID, // source location
data text,
PRIMARY KEY ((machine_id, sensor_name), time)
) WITH CLUSTERING ORDER BY (time DESC);
tags:
name: create-table-astra
- tags:
phase: rampup
params:
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- insert-rampup: |
insert into <<keyspace:baselines>>.<<table:iot>>
(machine_id, sensor_name, time, sensor_value, station_id, data)
values ({machine_id}, {sensor_name}, {time}, {sensor_value}, {station_id}, {data})
using timestamp {cell_timestamp}
idempotent: true
tags:
name: insert-rampup
params:
instrument: TEMPLATE(instrument-writes,TEMPLATE(instrument,false))
- tags:
phase: verify
type: read
params:
ratio: 1
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- select-verify: |
select * from <<keyspace:baselines>>.<<table:iot>>
where machine_id={machine_id} and sensor_name={sensor_name} and time={time};
verify-fields: "*, -cell_timestamp"
tags:
name: select-verify
params:
instrument: TEMPLATE(instrument-reads,TEMPLATE(instrument,false))
- tags:
phase: main
type: read
params:
ratio: <<read_ratio:1>>
cl: <<read_cl:LOCAL_QUORUM>>
statements:
- select-read: |
select * from <<keyspace:baselines>>.<<table:iot>>
where machine_id={machine_id} and sensor_name={sensor_name}
limit <<limit:10>>
tags:
name: select-read
params:
instrument: TEMPLATE(instrument-reads,TEMPLATE(instrument,false))
- tags:
phase: main
type: write
params:
ratio: <<write_ratio:9>>
cl: <<write_cl:LOCAL_QUORUM>>
statements:
- insert-main: |
insert into <<keyspace:baselines>>.<<table:iot>>
(machine_id, sensor_name, time, sensor_value, station_id, data)
values ({machine_id}, {sensor_name}, {time}, {sensor_value}, {station_id}, {data})
using timestamp {cell_timestamp}
idempotent: true
tags:
name: insert-main
params:
instrument: TEMPLATE(instrument-writes,TEMPLATE(instrument,false))

View File

@ -0,0 +1,39 @@
# You can run this file with this command line to see the values printed to stdout:
# ./ebdse run driver=stdout yaml=bindings/date.yaml cycles=10
# This file demonstrates different types of timestamp recipes
# that you can use with virtdata. (The bindings used in ebdse)
# If you want to control the output, uncomment and edit the statement template below
# and modify the named anchors to suit your output requirements.
#statements:
# example1: "{fullname}\n"
bindings:
# All uncommented lines under this are indented, so they become named bindings below
# the entry above
# Normally, the value that you get with a cycle starts at 0.
cycleNum: Identity();
# here we convert the cycle number to a Date by casting.
id: Identity(); ToDate();
# Date during 2017 (number of milliseconds in a year: 31,536,000,000)
date: StartingEpochMillis('2017-01-01 23:59:59'); AddHashRange(0L,31536000000L); StringDateWrapper("YYYY-MM-dd")
# Example output:
# date : 2017-09-17
# date : 2017-08-01
# date : 2017-04-22
# date : 2017-04-09
# date : 2017-05-28
# date : 2017-08-06
# date : 2017-07-05
# date : 2017-02-07
# date : 2017-05-25
# date : 2017-12-02
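# A similar recipe (a sketch, not part of the original set): the same range
# shifted to 2018, with a time-of-day component in the output format:
# date_2018: StartingEpochMillis('2018-01-01 00:00:00'); AddHashRange(0L,31536000000L); StringDateWrapper("yyyy-MM-dd HH:mm")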

View File

@ -0,0 +1,28 @@
# You can run this file with this command line to see the values printed to stdout:
# ./ebdse run driver=stdout yaml=bindings/expr.yaml cycles=10
# This file demonstrates different types of timestamp recipes
# that you can use with virtdata. (The bindings used in ebdse)
# If you want to control the output, uncomment and edit the statement template below
# and modify the named anchors to suit your output requirements.
#statements:
# example1: "{fullname}\n"
bindings:
# flight times based on hour / minute / second computation
hour: HashRange(0,2); ToInt()
minute: Shuffle(0,2); ToInt()
second: HashRange(0,60); ToInt()
flightDate: HashRange(0,2); Mul(3600000); Save('hour'); Shuffle(0,2); Mul(60000); Save('minute'); HashRange(0,60); Mul(1000); Save('second'); Expr('hour + minute + second'); StartingEpochMillis('2018-10-02 04:00:00'); ToDate(); ToString()
flightDateFixed: Save('cycle'); HashRange(0,2); Mul(3600000); Load('cycle'); Save('hour'); Shuffle(0,2); Mul(60000); Save('minute'); Load('cycle'); HashRange(0,60); Mul(1000); Save('second'); Expr('hour + minute + second'); StartingEpochMillis('2018-10-02 04:00:00'); ToDate(); ToString()
flightDateLong: Save('cycle'); HashRange(0,2); Mul(3600000); Load('cycle'); Save('hour'); Shuffle(0,2); Mul(60000); Save('minute'); Load('cycle'); HashRange(0,60); Mul(1000); Save('second'); Expr('hour + minute + second'); ToString()
# status that depends on score
riskScore: Normal(0.0,5.0); Clamp(1, 100); Save('riskScore') -> int
status: |
Expr('riskScore > 90 ? 0 : 1') -> long; ToBoolean(); ToString()
status_2: |
ToInt(); Expr('riskScore >90 ? 0 : 1') -> int; WeightedStrings('accepted:1;rejected:1')

View File

@ -0,0 +1,172 @@
# You can run this file with this command line to see the values printed to stdout:
# ./ebdse run driver=stdout yaml=bindings/text.yaml cycles=10
# This file demonstrates different types of timestamp recipes
# that you can use with virtdata. (The bindings used in ebdse)
# If you want to control the output, uncomment and edit the statement template below
# and modify the named anchors to suit your output requirements.
#statements:
# example1: "{fullname}\n"
bindings:
# All uncommented lines under this are indented, so they become named bindings below
# the entry above
# Normally, the value that you get with a cycle starts at 0.
cycleNum: Identity();
# here we convert the cycle number to a text by casting.
id: Identity(); ToString()
## Names
# See http://docs.virtdata.io/functions/funcref_premade/
# Full name
fullname: FullNames()
# Example output:
# fullname : Norman Wolf
# fullname : Lisa Harris
# fullname : John Williams
# fullname : Freda Gaytan
# fullname : Violet Ferguson
# fullname : Larry Roberts
# fullname : Andrew Daniels
# fullname : Jean Keys
# fullname : Mark Cole
# fullname : Roberta Bounds
# Name with last name first
fullname_lastname_first: Template('{}, {}', LastNames(), FirstNames())
# Example output:
# fullname_lastname_first : Miracle, Lisa
# fullname_lastname_first : Wolf, John
# fullname_lastname_first : Harris, Freda
# fullname_lastname_first : Williams, Violet
# fullname_lastname_first : Gaytan, Larry
# fullname_lastname_first : Ferguson, Andrew
# fullname_lastname_first : Roberts, Jean
# fullname_lastname_first : Daniels, Mark
# fullname_lastname_first : Keys, Roberta
# fullname_lastname_first : Cole, Timothy
# Phone
phone: compose HashRange(10000000000L,99999999999L); Combinations('0-9;0-9;0-9;-;0-9;0-9;0-9;-;0-9;0-9;0-9;0-9')
# Example output:
# $ ebdse run driver=stdout yaml=example-bindings format=readout cycles=10
# phone : 241-478-6787
# phone : 784-482-7668
# phone : 804-068-5502
# phone : 044-195-5579
# phone : 237-202-5601
# phone : 916-390-8911
# phone : 550-943-7851
# phone : 762-031-1362
# phone : 234-050-2563
# phone : 312-672-0039
## Career
career: HashedLineToString('data/careers.txt')
# Example output:
# career : Paper Goods Machine Setters, Operators, and Tenders
# career : Training and Development Specialists
# career : Embossing Machine Set-Up Operators
# career : Airframe-and-Power-Plant Mechanics
# career : Sales Representatives, Agricultural
# career : Automotive Body and Related Repairers
# career : Community Health Workers
# career : Billing, Posting, and Calculating Machine Operators
# career : Data Processing Equipment Repairers
# career : Sawing Machine Setters and Set-Up Operators
## Job Description
jobdescription: Add(0); HashedLineToString('data/jobdescription.txt')
## Weighted enumerated values
# Sorting hat (even distribution)
house: WeightedStrings('Gryffindor:0.2;Hufflepuff:0.2;Ravenclaw:0.2;Slytherin:0.2')
# Example output:
# house : Hufflepuff
# house : Ravenclaw
# house : Slytherin
# house : Slytherin
# house : Gryffindor
# house : Hufflepuff
# house : Ravenclaw
# house : Ravenclaw
# house : Hufflepuff
# house : Hufflepuff
## Weighted prefixes
prefix: WeightedStrings('Mr:0.45;Mrs:0.25;Ms:0.1;Miss:0.1;Dr:0.05')
# Example output:
# prefix : Mr
# prefix : Mrs
# prefix : Miss
# prefix : Miss
# prefix : Mr
# prefix : Mrs
# prefix : Mrs
# prefix : Mrs
# prefix : Mr
# prefix : Mr
# prefix : Mr
# prefix : Mr
# prefix : Mrs
# prefix : Mrs
# prefix : Mr
# prefix : Mr
# prefix : Mrs
# prefix : Miss
# prefix : Ms
# prefix : Dr
## Current Employer
current_employer: HashedLineToString('data/companies.txt')
# Example output:
# current_employer : Monsanto Company
# current_employer : International Flavors & Fragrances
# current_employer : Carpenter Technology Corporation
# current_employer : Union Pacific Corporation
# current_employer : Rush Enterprises
# current_employer : Peabody Energy Corporation
# current_employer : Rockwell Automation
# current_employer : Auto-Owners Insurance Group
# current_employer : ArcBest Corporation
# current_employer : WGL Holdings
## Sensor
sensor_name: HashedLineToString('data/variable_words.txt')
# Example output:
# sensor_name : rotational_latency
# sensor_name : half_life
# sensor_name : clarity
# sensor_name : fairness
# sensor_name : diversity
# sensor_name : turbulence
# sensor_name : mode
# sensor_name : current
# sensor_name : rating
# sensor_name : stall_speed

View File

@ -0,0 +1,72 @@
# You can run this file with this command line to see the values printed to stdout:
# ./ebdse run driver=stdout yaml=bindings/timestamp.yaml cycles=10
# This file demonstrates different types of timestamp recipes
# that you can use with virtdata. (The bindings used in ebdse)
# If you want to control the output, uncomment and edit the statement template below
# and modify the named anchors to suit your output requirements.
#statements:
# example1: "{epochMillis}\n"
bindings:
# All uncommented lines under this are indented, so they become named bindings below
# the entry above
# Normally, the value that you get with a cycle starts at 0.
cycleNum: Identity();
# So far, we've only been dealing in milliseconds. This is important to get working
# before adding the next step, converting to a more specific type.
# You can take any millisecond output and add conversion functions as shown below.
# this one converts to a java.util.Date
randomDateWithinFeb2018: AddHashRange(0,2419200000L); StartingEpochMillis('2018-02-01 05:00:00'); ToDate();
# ToDate(...) supports a few argument forms that you can experiment with.
# ToDate(int) will space the dates apart by this many milliseconds.
# ToDate(int,int) will space the dates apart by some millis and also repeat the value for some number of cycles.
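# For example (a sketch): ToDate(1000,10) would space dates one second apart
# and repeat each value for ten cycles before advancing.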
# Alternately, you might want to use an org.joda.time.DateTime instead of a java.util.Date:
randomJodaDateWithinFeb2018: AddHashRange(0,2419200000L); StartingEpochMillis('2018-02-01 05:00:00'); ToJodaDateTime();
# ToJodaDateTime(...) also supports the space and repeat forms as shown above for ToDate(...)
# You can also have the dates in order, but with some limited out-of-order perturbation.
# In this case, we are swizzling the offset by some pseudo-random amount, up to an hour (in millis)
randomDateWithinFeb2018Jittery: AddHashRange(0,3600000L); StartingEpochMillis('2018-02-01 05:00:00'); ToDate();
# If you want to have the result be a string-formatted date representation for testing, try this:
# You can use any formatter from here: http://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html
timeuuid_string: AddHashRange(0,2419200000L); StartingEpochMillis('2018-02-01 05:00:00'); StringDateWrapper("yyyy-MM-dd HH:mm:ss.SSS");
# ebdse bundles some specialized mapping functions in addition to those explained above, which
# come with eb. These are shown below.
# You can create a com.datastax.driver.core.LocalDate for use with the java driver.
# This takes as its input, the number of days since the unix epoch.
localdate: LongToLocalDateDays()
# You can also take the millis from any of the examples above which provide epoch millis,
# and convert the output to a millisecond-stable value, analogous to the CQL functions
# that do the same.
minUUID: AddHashRange(0,3600000); StartingEpochMillis('2018-02-01 05:00:00'); ToTimeUUIDMin();
maxUUID: AddHashRange(0,3600000); StartingEpochMillis('2018-02-01 05:00:00'); ToTimeUUIDMax();
# If you find useful recipes which are needed by others, please contribute them back to our examples!

View File

@ -0,0 +1,62 @@
# You can run this file with this command line to see the values printed to stdout:
# ./ebdse run driver=stdout yaml=bindings/timeuuid.yaml cycles=10
# This file demonstrates different types of timestamp recipes
# that you can use with virtdata. (The bindings used in ebdse)
# If you want to control the output, uncomment and edit the statement template below
# and modify the named anchors to suit your output requirements.
#statements:
# example1: "{fullname}\n"
bindings:
# All uncommented lines under this are indented, so they become named bindings below
# the entry above
# Normally, the value that you get with a cycle starts at 0.
cycleNum: Identity();
# here we convert the cycle number to a TIMEUUID by casting.
id: Identity(); ToEpochTimeUUID()
## Client ID
client_id: AddHashRange(0L, 2000000000000L); ToEpochTimeUUID()
# Example output:
# client_id : 4eb369b0-91de-11bd-8000-000000000000
# client_id : 0b9edab0-5401-11e7-8000-000000000000
# client_id : 58f21c30-0eec-11f3-8000-000000000000
# client_id : 4f547e60-a48a-11ca-8000-000000000000
# client_id : 42db8510-cad8-11bb-8000-000000000000
# client_id : 78cc7790-529c-11d6-8000-000000000000
# client_id : 55382200-9cfd-11d7-8000-000000000000
# client_id : 1ebdbef0-b6dc-11b7-8000-000000000000
# client_id : 8bc58ba0-57fe-11da-8000-000000000000
# client_id : 03d1b690-ba64-11f5-8000-000000000000
# If you wanted a java.util.UUID instead of a java.util.Date type, you can use something like below.
# This form avoids setting the non-time fields in the timeuuid value. This makes deterministic testing
# possible, even though the basic data type, as used in practice, is designed specifically to avoid repeatability.
timeuuid1: AddHashRange(0,2419200000L); StartingEpochMillis('2018-02-01 05:00:00'); ToEpochTimeUUID();
# There is a shortcut for this version supported directly by ToEpochTimeUUID(..) as seen here:
timeuuid2: AddHashRange(0,2419200000L); ToEpochTimeUUID('2018-02-01 05:00:00');
# You can also access the finest level of resolution of the timeuuid type, where each cycle value represents
# the smallest possible change for a timeuuid. Bear in mind that this represents many many sub-millisecond
# level timestamp values which may not be easy to see in normal timestamp formats. In this case, millisecond
# semantics are not appropriate, so make sure you adjust the input values accordingly.
timeuuid_finest1: ToFinestTimeUUID();
# However, since starting at some reference time is a popular option, ToFinestTimeUUID(...) also supports
# the shortcut version just like ToEpochTimeUUID(). This is provided because converting between epoch
# millis and timeuuid ticks is not fun.
timeuuid_finest_relative: ToFinestTimeUUID('2018-02-01 05:00:00');

View File

@ -0,0 +1,39 @@
# You can run this file with this command line to see the values printed to stdout:
# ./ebdse run driver=stdout yaml=bindings/uuid.yaml cycles=10
# This file demonstrates different types of timestamp recipes
# that you can use with virtdata. (The bindings used in ebdse)
# If you want to control the output, uncomment and edit the statement template below
# and modify the named anchors to suit your output requirements.
#statements:
# example1: "{fullname}\n"
bindings:
# All uncommented lines under this are indented, so they become named bindings below
# the entry above
# Normally, the value that you get with a cycle starts at 0.
cycleNum: Identity();
# here we convert the cycle number to a UUID by casting.
id: Identity(); ToHashedUUID()
## Station ID (100 unique UUID values, can override stations on the command-line)
station_id: Mod(<<stations:100>>); ToHashedUUID()
# Example output:
# station_id : 28df63b7-cc57-43cb-9752-fae69d1653da
# station_id : 5752fae6-9d16-43da-b20f-557a1dd5c571
# station_id : 720f557a-1dd5-4571-afb2-0dd47d657943
# station_id : 6fb20dd4-7d65-4943-9967-459343efafdd
# station_id : 19674593-43ef-4fdd-bdf4-98b19568b584
# station_id : 3df498b1-9568-4584-96fd-76f6081da01a
# station_id : 56fd76f6-081d-401a-85eb-b1d9e5bba058
# station_id : 45ebb1d9-e5bb-4058-b75d-d51547d31952
# station_id : 375dd515-47d3-4952-a49d-236be9a5c070
# station_id : 249d236b-e9a5-4070-9afa-8fae9060d959

View File

@ -0,0 +1,54 @@
scenarios:
default:
schema: run driver=cql tags==phase:schema cycles==UNDEF threads==1
rampup: run driver=cql tags==phase:rampup cycles=TEMPLATE(rampup-cycles,100K) threads=auto
bindings:
userid: Template('user-{}',ToString()); SaveString('userid');
interest: Template('interest-{}',ToString());
blocks:
- name: schema
tags:
phase: schema
statements:
- create-keyspace: |
create KEYSPACE if not exists TEMPLATE(keyspace,examples)
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}
AND durable_writes = true;
- create-users-table: |
create table if not exists TEMPLATE(keyspace,examples).users (
userid text PRIMARY KEY
);
- create-interests-table: |
create table if not exists TEMPLATE(keyspace,examples).interests (
userid text,
interest text,
primary key (interest, userid)
);
- name: rampup
tags:
phase: rampup
statements:
- insert-users: |
insert into TEMPLATE(keyspace,examples).users (userid) VALUES ({userid});
tags:
entity: users
- insert-interests: |
insert into TEMPLATE(keyspace,examples).interests(
interest, userid
) VALUES (
{interest}, {userid}
);
tags:
entity: interests
- name: main
tags:
phase: main
statements:
- read-user: |
select * from TEMPLATE(keyspace,examples).users
where userid={userid};
- read-interests: |
select * from TEMPLATE(keyspace,examples).interests
where interest={interest};

View File

@ -0,0 +1,3 @@
These docs are carried over from the prior cql 1.9 and cql 3.* drivers. They do not describe
current behavior, but are here as a reference point for closing the implementation gap
in the new cqld4 driver before it is moved from prerelease status to mainline releases.

View File

@ -0,0 +1,97 @@
# cql driver - advanced features
This is an addendum to the standard CQL Activity Type docs; for those,
see "cql". Use the features in this guide carefully. Because they are
less commonly used than the main CQL features, they are not documented
as thoroughly.
### ResultSet and Row operators
Within the CQL activity type, in synchronous mode (activities without the
async= parameter), you can attach operators to a given
statement such that it will get per-statement handling. These operators
are ways of interrogating the result of an operation, saving values, or
managing other side-effects for specific types of testing.
When enabled for a statement, operators are applied in this order:
1. Activity-level ResultSet operators are applied in specified order.
2. Statement-level ResultSet operators are applied in specified order.
3. Activity-level Row operators are applied in specified order.
4. Statement-level Row operators are applied in specified order.
The result set handling does not take any extra steps to make
a copy of the data. When a row is read from the result set,
it is consumed from it. Thus, if you want to do anything with
row data, you must apply a row operator as explained below.
### CQL Statement Parameters
- **rsoperators** - If provided as a CQL statement param, then the
list of operator names that follow, separated by a comma, will
be used to attach ResultSet operators to the given statement.
Such operators act on the whole result set of a statement.
- **rowoperators** - If provided as a CQL statement param, then the
list of operator names that follow, separated by a comma, will
be used to attach Row operators to the given statement.
## Available ResultSet Operators
- pushvars - Push a copy of the current thread local variables onto
the thread-local stack. This does nothing with the ResultSet data,
but is meant to be used for stateful management of these in
conjunction with the row operators below.
- popvars - Pop the last thread local variable set from the thread-local
stack into vars, replacing the previous content. This does nothing
with the ResultSet data.
- clearvars - Clears the contents of the thread local variables. This
does nothing with the ResultSet data.
- trace - Flags a statement to be traced on the server-side and then
logs the details of the trace to the trace log file.
- log - Logs basic data to the main log. This is useful to verify that
operators are loading and triggering as expected.
- assert_singlerow - Throws an exception (ResultSetVerificationException)
if the ResultSet has more or less than one row.
Examples:
```yaml
statements:
- s1: |
a statement
rsoperators: pushvars, clearvars
```
## Available Row Operators
- savevars - Copies the values of the row into the thread-local variables.
- saverows - Copies the rows into a special CQL-only thread local row state.
Examples:
```yaml
statements:
- s2: |
a statement
rowoperators: saverows
```
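These operators can be combined. Here is a hedged sketch of capturing a row's
values and then restoring the previous variable state afterward (statement
bodies are placeholders, as in the examples above):
```yaml
statements:
 - s3: |
     a statement
   rsoperators: pushvars
   rowoperators: savevars
 - s4: |
     a statement
   rsoperators: popvars
```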
## Injecting additional Queries (Future)
It is possible to inject new operations to an activity. However, such operations are _indirect_ to cycles, since they
must be based on the results of other operations. As such, they will not be represented in cycle output or other
advanced features. This is a specific feature for the CQL activity -- implemented internal to the way a CQL cycle is
processed. A future version of NoSQLBench will provide a more uniform way to achieve this result across activity types.
For now, remember that this is a CQL-only capability.
- subquery-statement - Adds additional operations to the current cycle, based
  on the contents of the CQL-only thread-local row state. The value of this
  parameter is the name of a statement in the current YAML. Each saved row is
  consumed from this list, and a new operation is added to the current cycle.
- subquery-concurrency - Allow subqueries to execute with concurrency, up to
the level specified.
default: 1
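A hedged sketch of how these parameters might combine with the saverows
operator (statement names and bodies are illustrative placeholders, not a
verified recipe):
```yaml
statements:
 - find-rows: |
     a statement which returns rows
   rowoperators: saverows
   subquery-statement: follow-up
   subquery-concurrency: 2
 - follow-up: |
     a statement built from each saved row
```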

View File

@ -0,0 +1,201 @@
# cql error handling
The error handling facility utilizes a type-aware error handler
provided by nosqlbench. However, it is much more modular and configurable
than most error handlers found in other testing tools. The trade-off here
is that so many options may bewilder newer users. If you agree, then
simply use one of these basic recipes in your activity parameters:
# error and stop on *any* exception
# incidentally, this is the same as the deprecated diagnose=true option
errors=stop
# error and stop for (usually) unrecoverable errors
# warn and retry everything else (this is actually the default)
errors=stop,retryable->retry
# record histograms for WriteTimeoutException, error and stop
# for everything else.
errors=stop,WriteTimeoutException:histogram
As you can see, the error handling format is pretty basic. Behind this basic
format is a modular and flexible configuration scheme that supports both
simple and advanced testing setups. The errors value is simply a list of
error-to-handler-verb mappings, but it also allows a single verb to be
specified to cover all error types. Going from left to right, each mapping is
applied in order. You can use any of ':', '->', or '=' as the error-to-verb
assignment operator.
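For example, the following three assignments are equivalent, differing only in
the assignment operator used:

    errors=stop,WriteTimeoutException:histogram
    errors=stop,WriteTimeoutException->histogram
    errors=stop,WriteTimeoutException=histogram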
Anytime you assign a value to the *errors* parameter for a cql activity, you are
replacing the default 'stop,retryable->retry,unverified->stop' configuration.
That is, each time this value is assigned, a new error handler is configured and
installed according to the new value.
### errors= parameter format
The errors parameter contains a comma-separated list of one or more
handler assignments where the error can be in any of these forms:
- group name ( "unapplied" | "retryable" | "unverified" )
- a single exception name like 'WriteTimeoutException', or a substring of
that which is long enough to avoid ambiguity (only one match allowed)
- A regex, like '.*WriteTimeout.*' (multiple matches allowed)
The verb can be any of the named starting points in the error handler
stack, as explained below.
As a special case, if the handler assignment consists of only a single word,
then it is assumed to be the default handler verb. This gets applied
as a last resort to any errors which do not match another handler by class
type or parent class type. This allows for simple hard wiring of a
handler default for all non-specific errors in the form:
# force the test to stop with any error, even retryable ones
errors=stop
### Error Handler Verbs
When an error occurs, you can control how it is handled for the most part.
This is the error handler stack:
- **stop** - logs an error, and then rethrows the causing exception,
causing nosqlbench to shutdown the current scenario.
- **warn** - log a warning in the log, with details about the error and
associated statement.
- **retry** - Retry the operation if the number of retries hasn't been
used up *and* the causing exception falls in the set of
*retryable* errors.
- **histogram** - keep a histogram of the exception counts, under the name
errorhistos.classname, using the simple class name. The magnitude of
these histos is how long the operation was pending before the related
error occurred.
- **count** - keep a count in metrics for the exception, under the name
errorcounts.classname, using the simple class name.
- **counter** - same as **count**, added for compatibility with the newer
universal error handler. This one is the preferred name.
- **ignore** - do nothing, do not even retry or count
Each handling verb above is ordered from the most invasive to least
invasive starting at the top. With the exception of the **stop**
handler, the rest of them will be applied to an error all the way to the
bottom. For now, the error handling stack is exactly as above. You can't
modify it, although it may be made configurable in the future.
One way to choose the right handler is to say "How serious is this type of
error to the test results if it happens?" In general, it is best to be
more conservative and choose a more aggressive setting unless you are
specifically wanting to measure how often a given error happens, for
example.
Each exception type will have one and only one error handler at all times.
No matter how you set an error handler for a class, only the most recently
assigned handler stack will be active for it. This might be important to
keep in mind when you make multiple assignments to potentially overlapping
sets of error types. In any case, the default 'stop' handler will always
catch an error that does not otherwise have a more specific handler
assigned to it.
##### Error Types
The errors that can be handled are simply all the exception types that can
be thrown by either the DataStax Java Driver for DSE, *or* the nosqlbench
client itself. This includes errors that indicate a potentially
intermittent failure condition. It also includes errors that are more
permanent in nature, like WriteFailure, which would continue to occur on
subsequent retries without some form of intervention. The nosqlbench
application will also generate some additional exceptions that capture
common error cases that the Java driver doesn't or shouldn't have a
special case for, but which may be important for nosqlbench testing
purposes.
In nosqlbench, all error handlers are specific to a particular kind of
exception that you would catch in a typical application that uses DSE,
although you can tell a handler to take care of a whole category of
problems as long as you know the right name to use.
##### Assigned by Java Exception Type
Error handlers can be assigned to a common parent type in order to also handle
all known subtypes, hence the default on the top line applies to all of the
driver exceptions that do not have a more specific handler assigned, either
by a closer parent or directly.
##### Assigning by Error Group Name
Error types for which you would commonly assign the same handling behavior
are also grouped in predefined names. If a handler is assigned to one of
the group names, then the handler is assigned to all of the exceptions in the
group individually. For example, 'errors=retryable=stop'
### Recognized Exceptions
The whole hierarchy of exceptions as of DSE Driver 3.2.0 is as follows,
with the default configuration shown.
DriverException -> stop
FrameTooLongException
CodecNotFoundException
AuthenticationException
TraceRetrievalException
UnsupportedProtocolVersionException
NoHostAvailableException -> retry (group: retryable)
QueryValidationException (abstract)
InvalidQueryException
InvalidConfigurationInQueryException
UnauthorizedException
SyntaxError
AlreadyExistsException
UnpreparedException
InvalidTypeException
QueryExecutionException (abstract)
UnavailableException
BootstrappingException -> retry (group: retryable)
OverloadedException -> retry (group: retryable)
TruncateException
QueryConsistencyException (abstract)
WriteTimeoutException -> retry (group: retryable)
WriteFailureException -> retry (group: retryable)
ReadFailureException
ReadTimeoutException
FunctionExecutionException
DriverInternalError
ProtocolError
ServerError
BusyPoolException
ConnectionException
TransportException
OperationTimedOutException -> retry (group: retryable)
PagingStateException
UnresolvedUserTypeException
UnsupportedFeatureException
BusyConnectionException
EbdseException (abstract) -> stop
CQLResultSetException (abstract)
UnexpectedPagingException
ResultSetVerificationException
RowVerificationException
ChangeUnappliedCycleException (group:unapplied)
RetriesExhaustedCycleException -> count
##### Additional Exceptions
The following exceptions are synthesized directly by nosqlbench, but get
handled alongside the normal exceptions as explained above.
1. ChangeUnappliedException - The change unapplied condition is important to
detect when it is not expected, although some testing may intentionally send
changes that can't be applied. For this reason, it is kept as a separately
controllable error group "unapplied".
2. UnexpectedPaging - The UnexpectedPaging exception is meant to keep users from
being surprised when there is paging activity in the workload, as this can have
other implications for tuning and performance. See the details on the
**maxpages** parameter, and the *fetch size* parameter in the java
driver for details.
3. Unverified\* Exceptions - For data set verification; These exceptions
indicate when a cqlverify activity has found rows that differ from what
was expected.
4. RetriesExhaustedException - Indicates that all retries were exhausted before
a given operation could complete successfully.
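Putting the pieces above together, a hedged example of a fuller handler chain
using the documented groups and verbs (tune this to your own test goals):

    errors=warn,retryable->retry,unapplied->count,unverified->stop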

View File

@ -0,0 +1,42 @@
DriverException -> stop
1 FrameTooLongException
2 CodecNotFoundException
3 AuthenticationException
4 TraceRetrievalException
5 UnsupportedProtocolVersionException
6 NoHostAvailableException
7 QueryValidationException (abstract)
8 InvalidQueryException
9 InvalidConfigurationInQueryException
10 UnauthorizedException
11 SyntaxError
12 AlreadyExistsException
13 UnpreparedException
14 InvalidTypeException
15 QueryExecutionException (abstract) -> retry
16 UnavailableException
17 BootstrappingException
18 OverloadedException
19 TruncateException
20 QueryConsistencyException (abstract)
21 WriteTimeoutException
22 WriteFailureException
23 ReadFailureException
24 ReadTimeoutException
25 FunctionExecutionException
26 DriverInternalError
27 ProtocolError
28 ServerError
29 BusyPoolException
30 ConnectionException
31 TransportException
32 OperationTimedOutException
33 PagingStateException
34 UnresolvedUserTypeException
35 UnsupportedFeatureException
36 BusyConnectionException
41 EbdseCycleException (abstract) -> stop
37 ChangeUnappliedCycleException
38 ResultSetVerificationException
39 RowVerificationException (abstract)
40 UnexpectedPagingException

View File

@ -0,0 +1,83 @@
# CQL Load Balancing Options
With the CQL driver, you may configure load balancing with the same options you might use in
client code. However, they are expressed here in a command-line friendly form.
## Combining Policies
To apply these load balancer policies, set the activity parameter `lbp` with a comma-separated list
of policies from the examples below.
They are build as a nested set of polices, with the semantics of "and then". For example, the
TokenAwarePolicy followed by the LatencyAwarePolicy looks like `TAP(...),LAP(...)` which means
`TokenAwarePolicy(...)` and then `LatencyAwarePolicy(...)`. This is equivalent to Java code which
first constructs a LatencyAwarePolicy and then wraps it with a TokenAwarePolicy. This follows the
notion that the outer-most policy has primary control over options presented to child policies, and
thus you can think of the routing process as "TokenAwarePolicy decides ... " *and then* with what it
shares with the wrapped child policy, "LatencyAwarePolicy decides...", and so on.
Even though you specify them with the simple policy descriptions above, they are constructed
programmatically in Java and nested in the specified order, just as they would be in client code.
For example, a token aware policy wrapping a white list policy might look like this on your command
line:
lbp=TAP(),WLP(127.0.0.1)
## Supported Load Balancer Policies
Each supported policy is described in detail below, with the options supported.
### RRP: Round Robin Policy
Format: `RRP()`
**note** You can't wrap another policy with RRP.
### WLP: White List Policy
Format: `WLP(addr,...)`
### TAP: Token Aware Policy
Format: `TAP()`
### LAP: Latency Aware Policy
This policy has many optional parameters, so if you use it you must set them by name.
Format: `LAP(options...)`, where each option is one of the following:
- `exclusion_threshold` (or `et`) - The exclusion threshold, or how much worse a node has to be to
be excluded for a while. Javadoc: The default exclusion threshold (if this method is not called) is
`2`. In other words, the resulting policy excludes nodes that are more than twice slower than the
fastest node.
- `minimum_measurements` (or `mm`) - The minimum number of measurements to take before penalizing a
host. Javadoc: The default for this option (if this method is not called) is `50`. Note that it is
probably not a good idea to put this option too low if only to avoid the influence of JVM warm-up
on newly restarted nodes.
- `retry_period` (or `rp`) - The retry period, in seconds. Javadoc: The retry period defines how
long a node may be penalized by the policy before it is given a 2nd chance. This is 10 seconds by
default.
- `retry_period_ms` (or `rp_ms`) - The retry period, in milliseconds. This is the same as above, but
allows you to have more precise control if needed.
- `scale` (or `s`) - The scale parameter adjusts how abruptly the most recent measurements are
  scaled down in the moving average over time. Higher values reduce the significance of more
  recent measurements, lower values increase it. The default is 100ms.
- `scale_ms` - The scale parameter, in milliseconds. This is the same as above, but allows you to
have more precise control if needed.
- `update_rate` (or `ur`) - How often a node's latency average is computed. The default is 1/10
second.
- `update_rate_ms` (or `ur_ms`) - The update rate, in milliseconds.
Examples:
- `lbp="LAP(mm=10,rp_ms=10000)"`
- `lbp="LatencyAwarePolicy(minimum_measurements=10,retry_period_ms=10000)"`
### DCARRP: DC-Aware Round Robin Policy
Format: `DCARRP(localdc=somedcname)`
This load balancing policy does not expose other non-deprecated options in the bundled version of
the driver, and the datacenter name is required.
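As a further hedged example of policy nesting, token awareness wrapping a
DC-aware round robin (the datacenter name is a placeholder):

    lbp=TAP(),DCARRP(localdc=dc1)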

View File

@ -0,0 +1,419 @@
# cql driver
This is a driver which allows for the execution of CQL statements. This driver supports both sync and async modes, with
detailed metrics provided for both.
### Example activity definitions
Run a cql activity named 'cql1', with definitions from activities/cqldefs.yaml
... driver=cql alias=cql1 workload=cqldefs
Run a cql activity defined by cqldefs.yaml, but with shortcut naming
... driver=cql workload=cqldefs
Only run statement groups which match a tag regex
... driver=cql workload=cqldefs tags=group:'ddl.*'
Run the matching 'dml' statements, with 100 cycles, from [1000..1100)
... driver=cql workload=cqldefs tags=group:'dml.*' cycles=1000..1100
This last example shows that the cycle range is [inclusive..exclusive),
to allow for stacking test intervals. This is standard across all
activity types.
### CQL ActivityType Parameters
- **cqldriver** - default: dse - The type of driver to use, either dse, or oss. If you need DSE-specific features, use
the dse driver. If you are connecting to an OSS Apache Cassandra cluster, you must use the oss driver. The oss driver
option is only available in nosqlbench.
- **host** - The host or hosts to use for connection points to
the cluster. If you specify multiple values here, use commas
with no spaces.
Examples:
- `host=192.168.1.25`
- `host=192.168.1.25,testhost42`
- **workload** - The workload definition which holds the schema and statement defs.
see workload yaml location for additional details
(no default, required)
- **port** - The port to connect with
- **cl** - An override to consistency levels for the activity. If
this option is used, then all consistency levels will be replaced
by this one for the current activity, and a log line explaining
the difference with respect to the yaml will be emitted.
This is not a dynamic parameter. It will only be applied at
activity start.
- **cbopts** - default: none - this is how you customize the cluster
settings for the client, including policies, compression, etc. This
is a string of *Java*-like method calls just as you would use them
in the Cluster.Builder fluent API. They are evaluated inline
with the default Cluster.Builder options not covered below.
Example: cbopts=".withCompression(ProtocolOptions.Compression.NONE)"
- **whitelist** default: none - Applies a whitelist policy to the load balancing
policy in the driver. If used, a WhitelistPolicy(RoundRobinPolicy())
will be created and added to the cluster builder on startup.
Examples:
- `whitelist=127.0.0.1`
- `whitelist=127.0.0.1:9042,127.0.0.2:1234`
- **retrypolicy** default: none - Applies a retry policy in the driver
The only option supported for this version is `retrypolicy=logging`,
which uses the default retry policy, but with logging added.
- **reconnectpolicy** default: none - Applies a reconnection policy in the
driver. Supports
either `reconnectpolicy=exponential(minDelayInMs,maxDelayInMs)`
or `reconnectpolicy=constant(delayInMs)`. The driver reconnects using
this policy when the entire cluster becomes unavailable.
- **protocol_version** default: unset, defaults to driver default behavior
- Set the CQL protocol version. Valid values are V1, V2, V3, V4, V5,
DSE_V1, DSE_V2. Protocol is usually auto-negotiated, however, the
initial connection may use a lower protocol to ensure connectivity to
older server versions. If you know you are running on a newer server
version, you can set this to match.
- **pooling** default: none - Applies the connection pooling options to
the policy. Examples:
- `pooling=4:10`
keep between 4 and 10 connections to LOCAL hosts
- `pooling=4:10,2:5`
keep 4-10 connections to LOCAL hosts and 2-5 to REMOTE
- `pooling=4:10:2000`
keep between 4-10 connections to LOCAL hosts with
up to 2000 requests per connection
- `pooling=5:10:2000,2:4:1000` keep between 5-10 connections to
LOCAL hosts with up to 2000 requests per connection, and 2-4
connection to REMOTE hosts with up to 1000 requests per connection
Additionally, you may provide the following options on pooling. Any
of these that are provided must appear in this order:
`,heartbeat_interval_s:n,idle_timeout_s:n,pool_timeout_ms:n`, so a
full example with all options set would appear as:
`pooling=5:10:2000,2:4:1000,heartbeat_interval_s:30,idle_timeout_s:120,pool_timeout_ms:5`
- **socketoptions** default: none - Applies any of the valid socket
options to the client when the session is built. Each of the options
uses the long form of the name, with either a numeric or boolean
value. Individual sub-parameters should be separated by a comma, and
the parameter names and values can be separated by either equals or a
colon. All of these values may be changed:
- read_timeout_ms
- connect_timeout_ms
- keep_alive
- reuse_address
- so_linger
- tcp_no_delay
- receive_buffer_size
- send_buffer_size
Examples:
- `socketoptions=read_timeout_ms=23423,connect_timeout_ms=4444`
- `socketoptions=tcp_no_delay=true`
- **tokens** default: unset - Only executes statements that fall within
any of the specified token ranges. Others are counted in metrics
as skipped-tokens, with a histogram value of the cycle number.
Examples:
- tokens=1:10000,100000:1000000
- tokens=1:123456
- **maxtries** - default: 10 - how many times an operation may be
attempted before it is disregarded
- **maxpages** - default: 1 - how many pages can be read from a query which
is larger than the fetchsize. If more than this number of pages
is required for such a query, then an UnexpectedPaging exception
is passed to the error handler as explained below.
- **fetchsize** - controls the driver parameter of the same name.
Suffixed units can be used here, such as "50K". If this parameter
is not present, then the driver option is not set.
- **cycles** - standard, however the cql activity type will default
this to however many statements are included in the current
activity, after tag filtering, etc.
- **username** - the user to authenticate as. This option requires
that one of **password** or **passfile** also be defined.
- **password** - the password to authenticate with. This will be
ignored if passfile is also present.
- **passfile** - the file to read the password from. The first
line of this file is used as the password.
- **ssl** - specifies the type of the SSL implementation.
Disabled by default, possible values are `jdk` and `openssl`.
[Additional parameters may need to be provided](ssl.md).
- **jmxreporting** - enable JMX reporting if needed.
Examples:
- `jmxreporting=true`
- `jmxreporting=false` (the default)
- **alias** - this is a standard nosqlbench parameter, however the cql type will use the workload value also as the
alias value when not specified.
- **errors** - error handler configuration.
(default errors=stop,retryable->retry,unverified->stop)
Examples:
- errors=stop,WriteTimeoutException=histogram
- errors=count
- errors=warn,retryable=count
See the separate help on 'cqlerrors' for detailed
configuration options.
- **defaultidempotence** - sets default idempotence on the
driver options, but only if it has a value.
(default unset, valid values: true or false)
- **speculative** - sets the speculative retry policy on the cluster.
(default unset)
This can be in one of the following forms:
- pT:E:L - where :L is optional and
T is a floating point threshold between 0.0 and 100.0 and
E is an allowed number of concurrent speculative executions and
L is the maximum latency tracked in the tracker instance
(L defaults to 15000 when left out)
Examples:
- p99.8:5:15000ms - 99.8 percentile, 5 executions, 15000ms max tracked
- p98:2:10000ms - 98.0 percentile, 2 executions allowed, 10s max tracked
- Tms:E - where :E is optional and
T is a constant threshold latency and
E is the allowed number of concurrent speculative retries
(E defaults to 5 when left out)
Examples:
- 100ms:5 - constant threshold of 100ms and 5 allowed executions
- **seq** - selects the statement sequencer used with statement ratios.
(default: bucket)
(options: concat | bucket | interval)
The concat sequencer repeats each statement in order until the ratio
is achieved.
The bucket sequencer uses simple round-robin distribution to plan
statement ratios, a simple but unbalanced form of interleaving.
The interval sequencer apportions statements over time and then by
order of appearance for ties. This has the effect of interleaving
statements from an activity more evenly, but is less obvious in how
it works.
All of the sequencers create deterministic schedules which use an internal
lookup table for indexing into a list of possible statements.
- **trace** - enables a trace on a subset of operations. This is disabled
by default.
Examples:
`trace=modulo:100,filename:trace.log`
The above traces every 100th cycle to a file named trace.log.
`trace=modulo:1000,filename:stdout`
The above traces every 1000th cycle to stdout.
If the trace log is not specified, then 'tracelog' is assumed.
If the filename is specified as stdout, then traces are dumped to stdout.
- **sessionid** - names the configuration to be used for this activity. Within a given scenario, any activities that use
  the same sessionid will share a session and cluster. default: 'default'
- **drivermetrics** - enable reporting of driver metrics.
default: false
- **driverprefix** - set the metrics name that will prefix all CQL driver metrics.
default: 'driver.*clusterid*.'
The clusterid specified is included so that separate cluster and session
contexts can be reported independently for advanced tests.
- **usercodecs** - enable the loading of user codec libraries for more
details see: com.datastax.codecs.framework.UDTCodecInjector in the
nosqlbench code base. This is for dynamic codec loading with
user-provided codecs mapped via the internal UDT APIs. default: false
- **secureconnectbundle** - used to connect to CaaS, accepts a path to the
secure connect bundle that is downloaded from the CaaS UI. Examples:
- `secureconnectbundle=/tmp/secure-connect-my_db.zip`
- `secureconnectbundle="/home/automaton/secure-connect-my_db.zip"`
Check
out [Astra Documentation](https://docs.astra.datastax.com/docs/test-loading-data-with-nosqlbench)
for samples
- **insights** - Set to false to disable the driver from sending insights
monitoring information
- `insights=false`
- **tickduration** - sets the tickDuration (milliseconds) of
HashedWheelTimer of the java driver. This timer is used to schedule
speculative requests. Examples:
- `tickduration=10`
- `tickduration=100` (driver default value)
- **compression** - sets the transport compression to use for this
activity. Valid values are 'LZ4' and 'SNAPPY'. Both types are bundled
with EBDSE.
- **showcql** - logs cql statements as INFO (to see INFO messages in stdout use -v or greater) Note: this is expensive
and should only be done to troubleshoot workloads. Do not use `showcql` for your tests.
- **lbp** - configures the load balancing policies for the Java driver. With this parameter, you can
configure nested load balancing policies in short-hand form.
The policies available are documented in detail under the help topic `cql-loadbalancing`. See that
guide if you need more than the examples below.
Examples:
- `lbp=LAP(retry_period=3,scale=10)` - Latency aware policy with retry period of 3 seconds.
(Seconds is the default time unit, unless _ms parameter is used) and scale 10.
- `lbp=LAP(rp=3,s=10)` - Same as above, using the equivalent but terser form.
- `lbp=LAP(rp_ms=3000,s_ms=10000)` - Same as above, with milliseconds instead of
seconds.
- `lbp=LAP(s=10),TAP()` - Latency aware policy, followed by
  token aware policy.
### CQL YAML Parameters
A uniform YAML configuration format was introduced with engineblock 2.0.
As part of this format, statement parameters were added for the CQL Activity Type.
These parameters will be consolidated with the above parameters in time, but for
now **they are limited to a YAML params block**:
params:
ratio: 1
# Sets the statement ratio within the operation sequencer
# scheme. Integers only.
# When preparing the operation order (AKA sequencing), this
# ratio determines the relative frequency of the associated statements.
cl: ONE
# Sets the consistency level, using any of the standard
# identifiers from com.datastax.driver.core.ConsistencyLevel,
# any one of:
# LOCAL_QUORUM, ANY, ONE, TWO, THREE, QUORUM, ALL,
# EACH_QUORUM, SERIAL, LOCAL_SERIAL, LOCAL_ONE
prepared: true
# By default, all statements are prepared. If you are
# creating schema, set this to false.
idempotent: false
# For statements that are known to be idempotent, set this
# to true
instrument: false
# If a statement has instrument set to true, then
# individual Timer metrics will be tracked for
# that statement for both successes and errors,
# using the given statement name.
verify: *
compare: all
# Adds two operators to the operation:
# 1) verify that there is a single row result set in the response.
# 2) verify some or all of the field values by name and/or value.
# If this option is used on any statement, then the activity will
# provide verification metrics and exceptions, including details
# of verification in the log once the activity is completed.
# For full details on this field, see the docs on cqlverify.
# The 'all' compare mode cross-verifies all fields and field
# values between the reference data and the actual data.
logresultcsv: true
OR
logresultcsv: myfilename.csv
# If a statement has logresultcsv set to true,
# then individual operations will be logged to a CSV file.
# In this case the CSV file will be named as
# <statement-name>--results.csv.
# If the value is present and not "true", then the value will
# be used as the name of the file.
#
# The format of the file is:
# <cycle>,(SUCCESS|FAILURE),<nanos>,<rows-fetched>,(<error-class>|NONE)
# NOTES:
# 1) BE CAREFUL with this setting. A single logged line per
# result is not useful for high-speed testing as it will
# impose IO loads on the client to slow it down.
# 2) BE CAREFUL with the name. It is best to just pick good
# names for your statement defs so that everything remains
# coherent and nothing gets accidentally overwritten.
# 3) If logresultcsv is provided at the activity level, it
# applies to all statements, and the only valid values
# there are true and false.
start-timers: timername1, timername2, ...
#
# If a statement has start-timers value set, then the named
# timers are started in the local thread before the
# statement is executed
#
# Together, with the stop-timers modifier, you can measure
# sequences of statements with specific named boundaries.
#
# The name of the timer is qualified with the activity alias
# just as all other metric names.
#
# This is generally only useful when the async= parameter is
# NOT used, since the scope of the timer is thread-local. When
# async is used, many operations may overlap each other in the
# same thread, breaking linearization guarantees which make
# thread local scoping helpful for tracking linearized operations.
#
# When a timer is started, a timer context is created and stored
# under this name in the thread. You must ensure that an
# associated stop-timers setting is applied to another statement
# in order to trigger the tally of these metrics.
stop-timers: timername1, timername2, ...
#
# If a statement has a stop-timers value set, then after the
# statement is finished, whether by error or by successful
# completion, the named timers are stopped and the resulting
# measurement is added to metrics.
#
# If you add stop-timers with names that do not have a matching
# start-timers name, or vice-versa then an error is thrown.
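As a consolidated illustration, here is a hedged sketch of a statement
definition combining several of the parameters above (the statement name and
CQL are placeholders only):

    statements:
     - read-user: |
         select * from examples.users where userid={userid};
       ratio: 5
       cl: LOCAL_QUORUM
       idempotent: true
       instrument: true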
### Metrics
- alias.result - A timer which tracks the performance of an op result
only. This is the async get on the future, broken out as a separate
step.
- alias.result-success - A timer that records rate and histograms of the
time it takes from submitting a query to completely reading the result
set that it returns, across all pages. This metric is only counted for
non-exceptional results, while the result metric above includes
all operations.
- alias.bind - A timer which tracks the performance of the statement
binding logic, including the generation of data immediately prior
- alias.execute - A timer which tracks the performance of op submission
only. This is the async execution call, broken out as a separate step.
- alias.tries - A histogram of how many tries were required to get a
completed operation
- alias.pages - A timer which tracks the performance of paging, specific
to more than 1-page query results. i.e., if all reads return within 1
page, this metric will not have any data.
- alias.strides - A timer around each stride of operations within a thread
- alias.skipped-tokens - A histogram that records the count and cycle values
of skipped tokens.
## YAML Examples
Please see the bundled activities with nosqlbench for examples.

View File

@ -0,0 +1,242 @@
# dsegraph activity type
# warning: These docs are a work in progress
This is an activity type which allows for the execution of workloads
using DSE Graph and the DSE Java Driver.
This activity type is wired synchronously within each client
thread, however the async API is used in order to expose fine-grain
metrics about op binding, op submission, and waiting for a result.
## Example activity definitions
Run a dsegraph activity named 'a1', with definitions from activities/graphs.yaml
~~~
... type=dsegraph alias=a1 yaml=graphs
~~~
Run a dsegraph activity defined by graphs.yaml, but with shortcut naming
~~~
... type=dsegraph yaml=graphs
~~~
Only run statement groups which match a tag regex
~~~
... type=dsegraph yaml=graphs tags=group:'ddl.*'
~~~
Run the matching 'dml' statements, with 100 cycles, from [1000..1100)
~~~
... type=dsegraph yaml=graphs tags=group:'dml.*' cycles=1000..1100
~~~
This last example shows that the cycle range is [inclusive..exclusive),
to allow for stacking test intervals. This is standard across all
activity types.
## dsegraph ActivityType Parameters
- **yaml** - The file which holds the schema and statement defs.
(no default, required)
~~~
DOCS TBD FOR THIS SECTION
- **cl** - An override to consistency levels for the activity. If
this option is used, then all consistency levels will be replaced
by this one for the current activity, and a log line explaining
the difference with respect to the yaml will be emitted.
This is not a dynamic parameter. It will only be applied at
activity start.
~~~~
- **cbopts** - this is how you customize the cluster settings for
the client, including policies, compression, etc. This is
a string of *Java*-like method calls just as you would use them
in the Cluster.Builder fluent API. They are evaluated inline
with the default Cluster.Builder options not covered below.
Example: cbopts=".withCompression(ProtocolOptions.Compression.NONE)"
- **maxtries** - how many times an operation may be attempted
~~~
DOCS TBD FOR THIS SECTION
- **diagnose** - if this is set to true, then any exception for an
operation are thrown instead of handled internally. This can
be useful for diagnosing exceptions during scenario development.
In this version of ebdse, this is a shortcut for setting all the
exception handlers to **stop**.
~~~
- **cycles** - standard, however the cql activity type will default
this to however many statements are included in the current
activity, after tag filtering, etc.
- **username** - the user to authenticate as. This option requires
that one of **password** or **passfile** also be defined.
- **password** - the password to authenticate with. This will be
ignored if passfile is also present.
- **passfile** - the file to read the password from. The first
line of this file is used as the password.
- **alias** - this is a standard engineblock parameter, however
the cql type will use the yaml value also as the alias value
when not specified.
- **graphson** - the version of the graphson protocol to use:
default: 2
## Statement Parameters
- **repeat** - if specified, causes the statement blocks to be
lexically repeated before being evaluated as statements,
including enumerated bindings.
## Error Handling
#### Error Handlers
When an error occurs, you can control how it is handled.
This is the error handler stack:
- **stop** - causes the exception to be thrown to the runtime, forcing a shutdown.
- **warn** - log a warning in the log, with details about the error and associated statement.
- **count** - keep a count in metrics for the exception, under the name
exceptions.classname, using the simple class name, of course.
- **retry** - Retry the operation if the number of retries hasn't been
used up.
- **ignore** - do nothing, do not even retry or count
They are ordered from the most extreme to the most oblivious starting
at the top. With the exception of the **stop** handler, the rest of
them will be applied to an error all the way to the bottom. One way
to choose the right handler is to say "How serious is this to the test
run or the results of the test if it happens?" In general, it is best
to be more conservative and choose a more aggressive setting unless you
are specifically wanting to measure how often a given error happens,
for example.
#### Error Types
The errors that can be detected are sorted into three categories:
~~~
DOCS TBD FOR THIS SECTION
- **unapplied** - This was a LWT that did not get applied. All operations
are checked, and a ChangeUnapplied exception is thrown.
(This is a local exception to make error handling consistent)
This is a separate category from retryable, because you have to
have reactive logic to properly submit a valid request when it occurs.
~~~
- **retryable** - NoHostAvailable, Overloaded, WriteTimeout, and
ReadTimeout exceptions. These are all exceptions that might
succeed if tried again with the same payload.
- **realerrors** - ReadFailure, WriteFailure, SyntaxError, InvalidQuery.
These represent errors that are likely a persistent issue, and
will not likely succeed if tried again.
To set the error handling behavior, simply pair these categories up with
an entry point in the error handler stack. Here is an example, showing
also the defaults that are used if you do not specify otherwise:
retryable=retry realerrors=stop
## Generic Parameters
*provided by the runtime*
- **targetrate** - The target rate in ops/s
- **linkinput** - if the name of another activity is specified, this activity
will only go as fast as that one.
- **tags** - optional filter for matching tags in yaml sections (detailed help
link needed)
- **threads** - the number of client threads driving this activity
## Metrics
- \<alias\>.cycles - (provided by core input) A timer around the whole cycle
- \<alias\>.bind - A timer which tracks the performance of the statement
binding logic, including the generation of data immediately prior
- \<alias\>.execute - A timer which tracks the performance of op submission
only. This is the async execution call, broken out as a separate step.
- \<alias\>.result - A timer which tracks the performance of an op result only.
This is the async get on the future, broken out as a separate step.
- \<alias\>.tries - A histogram of how many tries were required to get a
completed operation
## YAML Format
The YAML file for a DSE Graph activity has one or more logical yaml documents,
each separated by three dashes (---), the standard YAML document separator. Each
yaml document may contain a tags section for the purpose of including or
excluding statements for a given activity:
~~~ (optional)
tags:
tagname: value
...
~~~
If no tags are provided in a document section, then it will be matched by
all possible tag filters. Conversely, if no tag filter is applied in
the activity definition, all tagged documents will match.
Statements can be specified at the top level or within named blocks. When
you have simple needs to just put a few statements into the yaml, the top-level
style will suffice:
~~~
name: statement-top-level-example
statements:
- statement 1
- statement 2
~~~
If you need to represent multiple blocks of statements in the same activity,
you might want to group them into blocks:
~~~
blocks:
- name: statement-block-1
statements:
- statement 1
- statement 2
~~~
At any level that you can specify statements, you can also specify data bindings:
~~~
statements:
- statement 1
- statement 2
bindings:
bindto1: foo
bindto2: bar
blocks:
- name: statement-block-1
statements:
- statement 1
bindings:
bindto1: foo
~~~
Data bindings specify how values are generated to plug into each operation. More
details on data bindings are available in the activity usage guide.
### Parameter Templating
Double angle brackets may be used to drop parameters into the YAML
arbitrarily. When the YAML file is loaded, and only then, these parameters
are interpolated from activity parameters like those above. This allows you
to create activity templates that can be customized simply by providing
additional parameters to the activity. There are two forms,
\<\<some_var_name:default_value\>\> and \<\<some_var_name\>\>. The first
form contains a default value. In any case, if one of these parameters is
encountered and a qualifying value is not found, an error will be thrown.
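For instance, a hedged minimal illustration of both forms (the names here are
placeholders): running this without a mytable= activity parameter would throw
an error, since that template has no default, while myks would fall back to 'test'.
~~~
statements:
 - select * from <<myks:test>>.<<mytable>> limit <<limit:10>>
~~~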
### YAML Location
The YAML file referenced in the yaml= parameter will be searched for in the following places, in this order:
1. A URL, if it starts with 'http:' or 'https:'
2. The local filesystem, if it exists there
3. The internal classpath and assets in the ebdse jar.
The '.yaml' suffix is not required in the yaml= parameter, however it is
required on the actual file. As well, the logical search path "activities/"
will be used if necessary to locate the file, both on the filesystem and in
the classpath.
This is a basic example below that can be copied as a starting template.
## YAML Example
---
CONTENT TBD

View File

@ -0,0 +1,56 @@
# SSL
Supported options:
- **ssl** - specifies the type of the SSL implementation.
Disabled by default, possible values are `jdk`, and `openssl`.
- **tlsversion** - specify the TLS version to use for SSL.
Examples:
- `tlsversion=TLSv1.2` (the default)
For `jdk` type, the following options are available:
- **truststore** - specify the path to the SSL truststore.
Examples:
- `truststore=file.truststore`
- **tspass** - specify the password for the SSL truststore.
Examples:
- `tspass=truststore_pass`
- **keystore** - specify the path to the SSL keystore.
Examples:
- `keystore=file.keystore`
- **kspass** - specify the password for the SSL keystore.
Examples:
- `kspass=keystore_pass`
- **keyPassword** - specify the password for the key.
Examples:
- `keyPassword=password`
For `openssl` type, the following options are available:
- **caCertFilePath** - path to the X509 CA certificate file.
Examples:
- `caCertFilePath=cacert.crt`
- **certFilePath** - path to the X509 certificate file.
Examples:
- `certFilePath=ca.pem`
- **keyFilePath** - path to the OpenSSL key file.
Examples:
- `keyFilePath=file.key`
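A hedged end-to-end example combining the `jdk` options above (file names and
passwords are placeholders):

    ssl=jdk tlsversion=TLSv1.2 truststore=file.truststore tspass=truststore_pass keystore=file.keystore kspass=keystore_pass keyPassword=password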

View File

@ -0,0 +1,4 @@
# cql help topics
- cql
- cql-errors
- cql-exception-list

View File

@ -1,12 +1,17 @@
description: creates local graphs which resemble a wagon-wheel topology
description: creates local graphs which resemble a wagon-wheel topology, using
DSE Graph, version 6.8 or newer
scenarios:
creategraph: run driver=cqld4 graphname=graph_wheels tags=phase:create-graph cycles===UNDEF
schema: run driver=cqld4 graphname=graph_wheels tags=phase:graph-schema cycles===UNDEF
disable-verify: run driver=cqld4 graphname=graph_wheels tags=phase:disable-verify cycles===UNDEF
rampup: run driver==cqld4 graphname=graph_wheels tags=phase:rampup cycles=1000
default:
creategraph: run driver=dsegraph graphname=graph_wheels tags=phase:graph-schema
schema: run driver=dsegraph graphname=graph_wheels tags=phase:graph-schema
main: run driver==dsegraph graphname=graph_wheels tags=name:main-add cycles=100000
devmode: run driver=dsegraph graphname=graph_wheels tags=name:dev-mode
prodmode: run driver=dsegraph graphname=graph_wheels tags=name:dev-mode
creategraph: run driver=cqld4 graphname=graph_wheels tags=phase:create-graph cycles===UNDEF
schema: run driver=cqld4 graphname=graph_wheels tags=phase:graph-schema cycles===UNDEF
rampup: run driver==cqld4 graphname=graph_wheels tags=phase:rampup cycles=1
devmode: run driver=cqld4 graphname=graph_wheels tags=name:dev-mode
prodmode: run driver=cqld4 graphname=graph_wheels tags=name:dev-mode
bindings:
sessionid: ToEpochTimeUUID()->java.util.UUID; ToString();
@ -16,54 +21,86 @@ bindings:
osversion: WeightedStrings('nougat:3;oreo:1;jellybean:2;4:1;4c:1;5:1;5c:1;trusty:1;xenial:1;yosemite:1;el capitan:2;sierra:3;high sierra:1;7:1;10:2')
ipaddress: Combinations('1;7;0-3;.;0-2;0-2;0-5;.;0-2;0-2;0-5')
createdtime: Add(1505256898)
diag_ten_pct: WeightedLongs('1:1;0:9')
diag_one_pct: WeightedLongs('1:1;0:99')
blocks:
- name: create-graph
create-graph:
tags:
phase: create-graph
statements:
- creategraph: >-
system.graph('<<graphname:graph_wheels>>').ifNotExists().create()
- name: create-schema
creategraph:
type: gremlin
script: >-
system.graph('<<graphname:graph_wheels>>').ifNotExists().create()
create-schema:
tags:
phase: graph-schema
statements:
- graph-schema: >-
schema.propertyKey('sessionid').Uuid().ifNotExists().create();
schema.propertyKey('deviceid').Uuid().ifNotExists().create();
schema.propertyKey('ipaddress').Text().ifNotExists().create();
schema.propertyKey('createdtime').Bigint().ifNotExists().create();
schema.vertexLabel('session').partitionKey('sessionid').properties('ipaddress', 'deviceid', 'createdtime').ifNotExists().create();
schema.propertyKey('type').Text().ifNotExists().create();
schema.propertyKey('os').Text().ifNotExists().create();
schema.propertyKey('osversion').Text().ifNotExists().create();
schema.vertexLabel('device').partitionKey('deviceid').properties('type', 'os', 'osversion').ifNotExists().create();
schema.edgeLabel('using').single().connection('session','device').ifNotExists().create();
tags:
name: graph-schema
- name: dev-mode
graph-schema:
type: gremlin
graphname: <<graphname:graph_wheels>>
script: >-
schema.vertexLabel('session')
.ifNotExists()
.partitionBy('sessionid', Uuid)
.property('ipaddress', Text)
.property('deviceid', Uuid)
.property('createdtime', Bigint)
.create();
schema.vertexLabel('device')
.ifNotExists()
.partitionBy('deviceid', Uuid)
.property('type', Text)
.property('os', Text)
.property('osversion', Text)
.create();
schema.edgeLabel('using')
.ifNotExists()
.from('session')
.to('device')
.create()
dev-mode:
tags:
phase: dev-mode
statements:
- dev-mode: >-
schema.config().option('graph.schema_mode').set('Development');
tags:
name: dev-mode
- name: prod-mode
dev-mode:
type: gremlin
graphname: <<graphname:graph_wheels>>
script: >-
schema.config().option('graph.schema_mode').set('Development');
prod-mode:
tags:
phase: prod-mode
statements:
- prod-mode: >-
schema.config().option('graph.schema_mode').set('Production');
tags:
name: prod-mode
- name: main
prod-mode:
type: gremlin
graphname: <<graphname:graph_wheels>>
script: >-
schema.config().option('graph.schema_mode').set('Production');
rampup:
tags:
phase: main
phase: rampup
statements:
- main-add: >-
device = graph.addVertex(label, 'device','deviceid', {deviceid}, 'type', {type}, 'os', {os}, 'osversion', {osversion});
session = graph.addVertex(label, 'session', 'sessionid', {sessionid}, 'ipaddress', {ipaddress}, 'deviceid', {deviceid}, 'createdtime', {createdtime});
session.addEdge('using', device);
tags:
name: main-add
main-add:
type: gremlin
diag: "{diag_one_pct}"
graphname: <<graphname:graph_wheels>>
script: >-
device = g.addV('device')
.property('deviceid', '{deviceid}' as UUID)
.property('type', '{type}')
.property('os', '{os}')
.property('osversion', '{osversion}')
.as('d')
.addV('session')
.property('sessionid', '{sessionid}' as UUID)
.property('ipaddress', '{ipaddress}')
.property('deviceid', '{deviceid}' as UUID)
.property('createdtime', {createdtime})
.as('s')
.addE('using').from('s').to('d');