{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"arrayShape": "cr:arrayShape",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isArray": "cr:isArray",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
"containedIn": "cr:containedIn"
},
"@type": "sc:Dataset",
"name": "chili100k_strat: Dataset to train or fine-tune CrystaLLM-pi for the targeted generation of experimental materials conditioned on XRD profiles",
"description": "The dataset contains experimentally determined crystal structures sourced from [Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://github.com/UlrikFriisJensen/CHILI) as described in [CHILI: Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://dx.doi.org/10.1145/3637528.3671538), a curated and filtered subset of the Crystallography Open Database [COD](https://www.crystallography.net/cod/). The structural data underwent text augmentation as per the pre-processing pipeline in [CrystaLLM-pi](https://github.com/C-Bone-UCL/CrystaLLM-pi). Each structure was labelled with its theoretical X-ray diffraction (XRD) pattern. The complete dataset, published on Hugging Face (https://huggingface.co/datasets/c-bone/chili100k_strat), can be used to train or fine-tune CrystaLLM-pi for the targeted generation of materials conditioned on XRD profiles.",
"alternateName": [
"c-bone/COD_XRD_small_nohc"
],
"conformsTo": "http://mlcommons.org/croissant/1.1",
"citeAs": "Cyprien Bone. Dataset. 2026. chili100k_strat. [accessed YYYY-MM-DD].",
"creator": {
"@type": "Person",
"name": "Cyprien Bone",
"url": "https://huggingface.co/c-bone"
},
"keywords": [
"1K - 10K",
"parquet",
"Text",
"Datasets",
"pandas",
"Croissant",
"Polars",
"US Region: US",
"CrystaLLM-pi",
"Crystallography Open Database (COD)",
"CIF",
"condition vector",
"crystal"
],
"datePublished": "2026-04-24T00:00:00Z",
"url": "https://huggingface.co/datasets/c-bone/chili100k_strat",
"version": "1.0.0",
"rai:dataCollection": "See dataCollectionRawData then dataPreprocessingProtocol then structural data underwent text augmentation as per the pre-processing pipeline in CrystaLLM-pi (https://github.com/C-Bone-UCL/CrystaLLM-pi). Each structure was labelled with its theoretical X-ray diffraction (XRD) pattern.",
"rai:dataCollectionType": [
"Synthetic",
"Experimental"
],
"rai:dataCollectionMissingData": "Not applicable",
"rai:dataCollectionRawData": "Data originally from Crystallography Open Database (COD).",
"rai:dataAnnotationProtocol": "Not applicable",
"rai:dataAnnotationPlatform": "Not applicable",
"rai:dataAnnotationAnalysis": "Not applicable",
"rai:annotatorDemographics": "Not applicable",
"rai:machineAnnotationTools": "Not applicable",
"rai:annotationsPerItem": "Not applicable",
"rai:dataPreprocessingProtocol": [
"The dataset contains experimentally determined crystal structures sourced from [Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://github.com/UlrikFriisJensen/CHILI) as described in [CHILI: Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://dx.doi.org/10.1145/3637528.3671538), a curated and filtered subset of the Crystallography Open Database [COD](https://www.crystallography.net/cod/)."
],
"rai:dataManipulationProtocol": "The dataset was made so that in the test set, we have: 500 materials whose structures were seen in training (is_novel == False); 500 materials whose atomic composition (reduced formula) was seen in training, but structure was never seen (so a polymorph, is_novel == True but is_comp_novel == False flags); 500 materials whose atomic composition was never seen in any training phase (is_comp_novel == True)",
"rai:dataImputationProtocol": "Not applicable",
"rai:dataUseCases": [
"Dataset to train or fine-tune CrystaLLM-pi (https://github.com/C-Bone-UCL/CrystaLLM-pi)"
],
"rai:dataBiases": [
"Not applicable"
],
"rai:personalSensitiveInformation": [
"No personal or sensitive information is included in the data."
],
"rai:dataSocialImpact": "Not applicable",
"rai:dataLimitations": [
"This dataset is 14K materials, but note that CrystaLLM model performance is best when training on datasets with over ~40K"
],
"rai:dataReleaseMaintenancePlan": "The data are being released as a one off with no immediate plans for revisions.",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "train-00000-of-00001_fileobject",
"name": "train-00000-of-00001.parquet",
"description": "Training subset of dataset (11091 records)",
"contentSize": "618485 B",
"contentUrl": "https://huggingface.co/datasets/c-bone/chili100k_strat/resolve/main/data/train-00000-of-00001.parquet?download=true",
"encodingFormat": "application/x-parquet",
"sha256": "40bfbaa00fde164b8b6ff511d3a4349a9ea09225bd3bf222ee8d815cac7c12af"
},
{
"@type": "cr:FileObject",
"@id": "test-00000-of-00001_fileobject",
"name": "test-00000-of-00001.parquet",
"description": "Test subset of dataset (1500 records)",
"contentSize": "107786 B",
"contentUrl": "https://huggingface.co/datasets/c-bone/chili100k_strat/resolve/main/data/test-00000-of-00001.parquet?download=true",
"encodingFormat": "application/x-parquet",
"sha256": "40bfbaa00fde164b8b6ff511d3a4349a9ea09225bd3bf222ee8d815cac7c12af"
},
{
"@type": "cr:FileObject",
"@id": "validation-00000-of-00001_fileobject",
"name": "validation-00000-of-00001.parquet",
"description": "Validation subset of dataset (1500 records)",
"contentSize": "107590 B",
"contentUrl": "https://huggingface.co/datasets/c-bone/chili100k_strat/resolve/main/data/validation-00000-of-00001.parquet?download=true",
"encodingFormat": "application/x-parquet",
"sha256": "40bfbaa00fde164b8b6ff511d3a4349a9ea09225bd3bf222ee8d815cac7c12af"
},
{
"@type": "cr:FileSet",
"@id": "parquet-files-for-config-default",
"description": "Croissant FileSet including training, test and validation datasets as separate parquet files. Each file contains data exported from Crystallography Open Database (COD) which can be used to pre-train CrystaLLM-pi. Each file is in the same format and contains the crystal material ID, reduced_Formula, contents of CIF file, and condition vector",
"cr:FileObject": [
{
"@id": "instances_annotations_val_fileobject"
}
],
"encodingFormat": "application/x-parquet",
"includes": "*.parquet"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"dataType": "cr:Split",
"key": {
"@id": "default_splits/split_name"
},
"@id": "default_splits",
"name": "default_splits",
"description": "Splits for the default config.",
"field": [
{
"@type": "cr:Field",
"@id": "default_splits/split_name",
"dataType": "sc:Text"
}
],
"data": [
{
"default_splits/split_name": "train"
},
{
"default_splits/split_name": "validation"
},
{
"default_splits/split_name": "test"
}
]
},
{
"@type": "cr:RecordSet",
"@id": "default",
"description": "c-bone/COD_XRD_small_nohc - 'default' subset\n\nAdditional information:\n- 3 splits: train, validation, test",
"field": [
{
"@type": "cr:Field",
"@id": "default/split",
"name": "splits",
"description": "Split of the dataset into enumerated values test/train/validation",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"fileProperty": "fullpath"
},
"transform": {
"regex": "^\\\\data\\\\(train|validation|test).+.parquet$"
}
},
"references": {
"field": {
"@id": "default_splits/split_name"
}
}
},
{
"@type": "cr:Field",
"@id": "default/database",
"name": "Database",
"description": "Source database that this crystal is from. This whole dataset is extracted from the database COD (Crystallography Open Database).",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "Database"
}
}
},
{
"@type": "cr:Field",
"@id": "default/material_id",
"name": "Material ID",
"description": "ID of crystal material in source database.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "Material ID"
}
}
},
{
"@type": "cr:Field",
"@id": "default/reduced_formula",
"name": "Reduced Formula",
"description": "Reduced formula of crystal.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "Reduced Formula"
}
}
},
{
"@type": "cr:Field",
"@id": "default/cif",
"name": "CIF",
"description": "Contents of crystal's CIF (Crystallographic Information File, as defined in https://www.iucr.org/resources/cif/spec/version1.1) as text.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "CIF"
}
}
},
{
"@type": "cr:Field",
"@id": "default/condition_vector",
"name": "Condition Vector",
"description": "High-dimensional numerical representation XRD of top 20 most intense XRD (X-ray diffraction) peaks of the crystal used to identify or reconstruct a crystal structure. It consists of a combined set of 40 values: the 20 highest peak positions (2 theta angles) and their corresponding 20 associated peak intensities (int). Normalisations are 2theta min-max for 0,90 and intensities min-max for 0,100",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "condition_vector"
}
}
},
{
"@type": "cr:Field",
"@id": "default/is_novel",
"name": "Is Novel?",
"description": "Indicates whether material's structures have been seen in training",
"dataType": "sc:Boolean",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "is_novel"
}
}
},
{
"@type": "cr:Field",
"@id": "default/is_comp_novel",
"name": "Is Composition Novel?",
"description": "Indicates whether material's\u00a0atomic composition (reduced formula) was seen in training, but structure was never seen (so a polymorph, is_novel == True but is_comp_novel == False flags)",
"dataType": "sc:Boolean",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "is_comp_novel"
}
}
},
{
"@type": "cr:Field",
"@id": "default/token_count",
"name": "Token Count",
"description": "Token count",
"dataType": "sc:Boolean",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "token_count"
}
}
}
]
}
]
}