Shuf [随机样本]
# 介绍
给定一个大小未知的数据集,从中抽取一个顺序随机的随机样本。这在数据分析中很常用,可用于从数据湖、数据仓库或大型 CSV 文件中获取一个较小的随机样本。
# 实现
# JavaScript
/**
 * Draws a random sample, in random order, from a dataset of unknown size.
 *
 * Used in data analytics, often as a way to get a small random sample from a
 * data lake or warehouse, or from a large CSV file.
 *
 * @param {Iterable.<T>|Iterator.<T>} datasetSource The data to sample: either an
 *   iterable (array, Set, generator object, ...) or a bare iterator exposing `next()`
 * @param {number} sampleSize The size of the sample to extract from the dataset
 * @returns {Array.<T>} The random sample, as an array
 * @template T
 */
function shuf (datasetSource, sampleSize) {
  // Both steps below must pull from the SAME cursor through next(), so an
  // iterable is resolved to a single iterator exactly once here. A generator
  // object returns itself from Symbol.iterator, and an object that only has
  // next() is passed through unchanged, so existing callers are unaffected.
  const isIterable = datasetSource != null &&
    typeof datasetSource[Symbol.iterator] === 'function'
  const iterator = isIterable ? datasetSource[Symbol.iterator]() : datasetSource

  const output = fillBaseSample(iterator, sampleSize)
  return randomizeOutputFromDataset(iterator, output)
}
/**
 * Reads up to `sampleSize` items from the source and returns them in random order.
 *
 * Each incoming item is placed with an inside-out Fisher-Yates step, so every
 * ordering of the items read is equally likely and each item costs O(1).
 * Reading stops as soon as the sample is full; the rest of the source is left
 * unread. When the source runs out first, the returned array is shorter than
 * `sampleSize`.
 *
 * @param {Iterator.<T>} datasetSource An iterator over the data (anything exposing `next()`)
 * @param {number} sampleSize The size of the sample to extract; a non-negative integer
 * @returns {Array.<T>} The items read, in random order
 * @throws {RangeError} If sampleSize is not a non-negative integer
 * @template T
 */
function fillBaseSample (datasetSource, sampleSize) {
  if (!Number.isInteger(sampleSize) || sampleSize < 0) {
    throw new RangeError('sampleSize must be a non-negative integer, got ' + String(sampleSize))
  }

  const output = []
  // Capacity is checked BEFORE pulling from the source, so nothing is consumed
  // once the sample is full (in particular nothing at all when sampleSize is 0).
  while (output.length < sampleSize) {
    const step = datasetSource.next()
    if (step.done) break

    // Pick a slot in [0, output.length]; the current occupant (if any) moves
    // to the new last position and the incoming value takes the chosen slot.
    const insertTo = Math.floor(Math.random() * (output.length + 1))
    if (insertTo === output.length) {
      output.push(step.value)
    } else {
      output.push(output[insertTo])
      output[insertTo] = step.value
    }
  }
  return output
}
/**
 * Streams the rest of the dataset through the sample, letting each new item
 * randomly take over one of the sample's slots.
 *
 * The n-th item seen overall (counting the items already in `output`) picks a
 * random index in [0, n); it overwrites that slot only when the index falls
 * inside the sample. The array passed in is copied, never modified.
 *
 * @param {Iterator.<T>} datasetSource An iterator over the remaining data (anything exposing `next()`)
 * @param {Array.<T>} output The sample collected so far
 * @returns {Array.<T>} A new array of the same length holding the final sample
 * @template T
 */
function randomizeOutputFromDataset (datasetSource, output) {
  const reservoir = Array.from(output)
  let seenCount = output.length

  for (let step = datasetSource.next(); !step.done; step = datasetSource.next()) {
    seenCount += 1
    const slot = Math.floor(Math.random() * seenCount)
    // Indexes past the end of the reservoir mean "discard this item"
    if (slot >= reservoir.length) continue
    reservoir[slot] = step.value
  }

  return reservoir
}
// Example
/**
 * Lazily produces `length` random integers, each in the range [0, 2^31 - 1)
 * (the upper bound itself is never produced).
 *
 * @param {number} length How many values to yield
 * @returns {Iterable<number>} A generator over the random values
 */
function * generateRandomData (length) {
  const upperBound = 2 ** 31 - 1
  let produced = 0
  while (produced < length) {
    yield Math.floor(Math.random() * upperBound)
    produced += 1
  }
}
// const source = generateRandomData(1000)
// const result = shuf(source, 10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# 参考
编辑 (opens new window)
上次更新: 2022/10/27, 20:28:55