The threshold check runs in two stages:
- Input check: `InstructionFileJob`
- Output check: `Instruction`
/**
 * Decides whether a job or an instruction passes the quality thresholds configured in [context].
 *
 * Jobs are checked by running their file summary through a chain of filters covering file type,
 * code complexity, binary/generated/minified detection, file size, and token length. Instructions
 * are checked by capping the token count of their combined text.
 *
 * @property context The worker context whose `qualityThreshold` configures every check.
 */
class ThresholdChecker(private val context: WorkerContext) {
    // Both are assigned once at construction and never rebound, hence `val`.
    private val registry: EncodingRegistry = Encodings.newDefaultEncodingRegistry()
    private val enc: Encoding = registry.getEncoding(EncodingType.CL100K_BASE)

    // Deliberately a getter: a new pipeline is assembled on every access, so each check reads
    // whatever `context.qualityThreshold` holds at that moment. Filters run in the order added.
    private val pipeline: Pipeline<FileSummary>
        get() = Pipeline<FileSummary>()
            .addFilter(ExtensionFilter(context.qualityThreshold))
            .addFilter(ComplexityFilter(context.qualityThreshold))
            .addFilter(BinaryGeneratedMinifiedFilter(context.qualityThreshold))
            .addFilter(SizeFilter(context.qualityThreshold))
            .addFilter(TokenLengthFilter(context.qualityThreshold))

    /**
     * Checks whether the given job passes the file-level filter pipeline.
     *
     * @param job The instruction file job whose `fileSummary` is run through the pipeline.
     * @return The pipeline's verdict for the job's file summary.
     */
    fun isMetThreshold(job: InstructionFileJob): Boolean = pipeline.process(job.fileSummary)

    /**
     * Checks whether the given instruction is non-empty and within the token budget.
     *
     * @param ins The instruction to evaluate.
     * @return `false` when `input` or `output` is empty; otherwise `true` only if the token count of
     * `instruction + input + output` does not exceed `context.qualityThreshold.maxTokenLength`.
     */
    fun isMetThreshold(ins: Instruction): Boolean {
        // An instruction with no input or no output is never accepted.
        if (ins.input.isEmpty() || ins.output.isEmpty()) {
            return false
        }

        // The limit applies to the three fields concatenated, not to each field separately.
        val totalToken = enc.encode(ins.instruction + ins.input + ins.output).size
        return totalToken <= context.qualityThreshold.maxTokenLength
    }
}
Default threshold
/**
 * Quality thresholds with their default values; every default comes from a constant in the
 * companion object, except `badsmellThreshold`, which defaults to `BsThresholds().toThresholds()`.
 */
@Serializable
data class InsQualityThreshold(
// Complexity limit; defaults to MAX_COMPLEXITY.
val complexity: Int = MAX_COMPLEXITY,
// File size limit; defaults to MAX_FILE_SIZE (presumably bytes — TODO confirm the unit).
val fileSize: Int = MAX_FILE_SIZE,
/**
* Character limit for a code chunk; defaults to MAX_CHAR_IN_CODE (1500).
*
* Background: https://docs.sweep.dev/blogs/chunking-2m-files
* The average token-to-character ratio for code is ~1:5 (1500 characters is about 300 tokens), and
* embedding models are capped at 512 tokens. Further, 1500 characters correspond approximately to
* 40 lines, roughly equivalent to a small to medium-sized function or class.
*
* NOTE(review): the earlier derivation "token length is 1024, so 1500 * 1024 / 512 = 1500" does not
* hold (1500 * 1024 / 512 = 3000), and MAX_TOKEN_LENGTH is 2048, not 1024. Confirm how 1500 was chosen.
*/
val maxCharInCode: Int = MAX_CHAR_IN_CODE,
/**
* Line limit for a code chunk; defaults to MAX_LINE_IN_CODE (320).
*
* NOTE(review): the earlier derivation "40 * 2048 / 512 = 320" does not hold (40 * 2048 / 512 = 160),
* and it also described the token length as 1024. Confirm how 320 was chosen.
*/
val maxLineInCode: Int = MAX_LINE_IN_CODE,
// Per-smell limits keyed by name; the key set is produced by BsThresholds.toThresholds(), not shown here.
val badsmellThreshold: Map<String, Int> = BsThresholds().toThresholds(),
// Upper bound on token count; ThresholdChecker compares an instruction's total tokens against it.
val maxTokenLength: Int = MAX_TOKEN_LENGTH,
) {
companion object {
const val MAX_TOKEN_LENGTH: Int = 2048
const val MAX_COMPLEXITY: Int = 1000
// Not used as a default in this class; presumably read elsewhere — verify against callers.
const val MAX_PROJECT_TYPED_COMPLETION_SIZE: Int = 1000
// 64 * 1024 = 65536.
const val MAX_FILE_SIZE: Int = 1024 * 64
const val MAX_LINE_IN_CODE: Int = 320
const val MAX_CHAR_IN_CODE: Int = 1500
// Not used as a default in this class; presumably read elsewhere — verify against callers.
const val MAX_RELATED_CODE_LINE: Int = 30
}
}
BsThresholds
/**
 * Default limits behind `InsQualityThreshold.badsmellThreshold`, which is initialised from
 * `BsThresholds().toThresholds()` (that conversion is not shown in this snippet).
 *
 * NOTE(review): the per-field descriptions below are inferred from the property names only;
 * confirm them against the code that consumes these limits.
 */
data class BsThresholds(
// presumably the parameter count at which a parameter list counts as "long" — TODO confirm
val bsLongParasLength: Int = 5,
// presumably the branch count at which an if/switch chain is flagged — TODO confirm
val bsIfSwitchLength: Int = 8,
// presumably the member count at which a class counts as "large" — TODO confirm
val bsLargeLength: Int = 20,
// presumably the line count at which a method counts as "long" — TODO confirm
val bsMethodLength: Int = 30,
// presumably a limit related to the lines of an if statement or condition — TODO confirm
val bsIfLinesLength: Int = 3,
)
Others
Complexity & File Size Metrics
Filename | Complexity | Code Lines | Size |
---|---|---|---|
List.java | 86 | 2387 | 75079 |
Stream.java | 53 | 2007 | 68523 |
Property.java | 34 | 1313 | 68781 |
Commander.java | 103 | 607 | 26801 |
ConfigConverter.java | 241 | 1081 | 52188 |
ConfigConverterTest.java | 0 | 2183 | 109774 |
CachedGoConfigIntegrationTest.java | 10 | 1307 | 76313 |
GoConfigMigrationIntegrationTest.java | 0 | 2224 | 115253 |
GoConfigMigratorIntegrationTest.java | 4 | 1513 | 81410 |
BasicCruiseConfig.java | 225 | 1628 | 62427 |
JobConfig.java | 118 | 565 | 23309 |
PipelineConfig.java | 147 | 998 | 36415 |
ConfigFileFixture.java | 1 | 1825 | 85626 |
MagicalGoConfigXmlLoaderTest.java | 2 | 4394 | 230409 |
PipelineSqlMapDao.java | 120 | 972 | 45329 |
MaterialRepository.java | 135 | 1108 | 54763 |
AgentServiceTest.java | 0 | 1543 | 75019 |
PipelineSqlMapDaoIntegrationTest.java | 19 | 1778 | 94116 |
StageSqlMapDaoIntegrationTest.java | 26 | 1926 | 98180 |
MaterialRepositoryIntegrationTest.java | 10 | 1718 | 93771 |
AgentServiceIntegrationTest.java | 2 | 1413 | 70354 |
AutoTriggerDependencyResolutionTest.java | 3 | 2099 | 93564 |
PipelineConfigServiceIntegrationTest.java | 2 | 1102 | 80678 |
SwaggerSpecificationCreator.java | 217 | 1237 | 45240 |
ConversionUtil.java | 123 | 579 | 20911 |
ObservationSearchQueryTest.java | 21 | 1930 | 76763 |
ObservationFhirResourceProviderIntegrationTest.java | 12 | 1503 | 69780 |
ObservationFhirResourceProviderIntegrationTest.java | 12 | 1660 | 76629 |
BaseDao.java | 190 | 1224 | 47815 |
ImmunizationTranslatorImpl.java | 105 | 442 | 15772 |
RestServiceImpl.java | 114 | 715 | 29624 |
BaseDelegatingResource.java | 109 | 851 | 31573 |