threshold work processes:

  • check input: InstructionFileJob
  • check output: Instruction
/**
 * The `ThresholdChecker` class is responsible for determining whether a given job or instruction
 * meets the threshold criteria for processing. It utilizes various criteria, including file type,
 * code complexity, file size, and token length, to make these determinations.
 *
 * @property context The worker context providing configuration settings for the threshold checks.
 */
class ThresholdChecker(private val context: WorkerContext) {
    private var registry: EncodingRegistry = Encodings.newDefaultEncodingRegistry()
    private var enc: Encoding = registry.getEncoding(EncodingType.CL100K_BASE)

    private val pipeline: Pipeline<FileSummary>
        get() {
            return Pipeline<FileSummary>()
                .addFilter(ExtensionFilter(context.qualityThreshold))
                .addFilter(ComplexityFilter(context.qualityThreshold))
                .addFilter(BinaryGeneratedMinifiedFilter(context.qualityThreshold))
                .addFilter(SizeFilter(context.qualityThreshold))
                .addFilter(TokenLengthFilter(context.qualityThreshold))
        }

    /**
     * Checks if the given job meets the threshold criteria for processing.
     *
     * @param job The instruction file job to be checked.
     * @return Returns true if the job meets the threshold criteria, false otherwise.
     */
    fun isMetThreshold(job: InstructionFileJob): Boolean {
        return pipeline.process(job.fileSummary)
    }

    /**
     * Determines whether the given instruction meets the threshold criteria.
     *
     * @param ins the instruction to be evaluated
     * @return true if the instruction meets the threshold criteria, false otherwise
     */
    fun isMetThreshold(ins: Instruction): Boolean {
        // skip empty instruction
        if (ins.input.isEmpty() || ins.output.isEmpty()) {
            return false
        }

        // limit by token length
        val totalToken = enc.encode(ins.instruction + ins.input + ins.output).size
        return totalToken <= context.qualityThreshold.maxTokenLength
    }
}

Default threshold

@Serializable
data class InsQualityThreshold(
    val complexity: Int = MAX_COMPLEXITY,
    val fileSize: Int = MAX_FILE_SIZE,
    /**
     * https://docs.sweep.dev/blogs/chunking-2m-files
     * This is because the average token to a character ratio for code is ~1:5(300 tokens), and embedding models are
     *  capped at 512 tokens. Further, 1500 characters correspond approximately to 40 lines, roughly equivalent to a
     *  small to medium-sized function or class.
     *
     * Our token length is 1024, so we can use 1500 * 1024 / 512 = 1500
     */
    val maxCharInCode: Int = MAX_CHAR_IN_CODE,
    /**
     * Our token length is 1024, so we can use 40 * 2048 / 512 = 320
     */
    val maxLineInCode: Int = MAX_LINE_IN_CODE,
    val badsmellThreshold: Map<String, Int> = BsThresholds().toThresholds(),
    val maxTokenLength: Int = MAX_TOKEN_LENGTH,
) {
    companion object {
        const val MAX_TOKEN_LENGTH: Int = 2048
        const val MAX_COMPLEXITY: Int = 1000
        const val MAX_PROJECT_TYPED_COMPLETION_SIZE: Int = 1000
        const val MAX_FILE_SIZE: Int = 1024 * 64
        const val MAX_LINE_IN_CODE: Int = 320
        const val MAX_CHAR_IN_CODE: Int = 1500
        const val MAX_RELATED_CODE_LINE: Int = 30
    }
}

BsThresholds

data class BsThresholds(
    val bsLongParasLength: Int = 5,
    val bsIfSwitchLength: Int = 8,
    val bsLargeLength: Int = 20,
    val bsMethodLength: Int = 30,
    val bsIfLinesLength: Int = 3,
)

Others

Complexity & FileSize Metric

Filename Complexity Code Lines Size
List.java 86 2387 75079
Stream.java 53 2007 68523
Property.java 34 1313 68781
Commander.java 103 607 26801
ConfigConverter.java 241 1081 52188
ConfigConverterTest.java 0 2183 109774
CachedGoConfigIntegrationTest.java 10 1307 76313
GoConfigMigrationIntegrationTest.java 0 2224 115253
GoConfigMigratorIntegrationTest.java 4 1513 81410
BasicCruiseConfig.java 225 1628 62427
JobConfig.java 118 565 23309
PipelineConfig.java 147 998 36415
ConfigFileFixture.java 1 1825 85626
MagicalGoConfigXmlLoaderTest.java 2 4394 230409
PipelineSqlMapDao.java 120 972 45329
MaterialRepository.java 135 1108 54763
AgentServiceTest.java 0 1543 75019
PipelineSqlMapDaoIntegrationTest.java 19 1778 94116
StageSqlMapDaoIntegrationTest.java 26 1926 98180
MaterialRepositoryIntegrationTest.java 10 1718 93771
AgentServiceIntegrationTest.java 2 1413 70354
AutoTriggerDependencyResolutionTest.java 3 2099 93564
PipelineConfigServiceIntegrationTest.java 2 1102 80678
SwaggerSpecificationCreator.java 217 1237 45240
ConversionUtil.java 123 579 20911
ObservationSearchQueryTest.java 21 1930 76763
ObservationFhirResourceProviderIntegrationTest.java 12 1503 69780
ObservationFhirResourceProviderIntegrationTest.java 12 1660 76629
BaseDao.java 190 1224 47815
ImmunizationTranslatorImpl.java 105 442 15772
RestServiceImpl.java 114 715 29624
BaseDelegatingResource.java 109 851 31573