NameUtil.java

/*
 * Portions of this software was developed by employees of the National Institute
 * of Standards and Technology (NIST), an agency of the Federal Government and is
 * being made available as a public service. Pursuant to title 17 United States
 * Code Section 105, works of NIST employees are not subject to copyright
 * protection in the United States. This software may be subject to foreign
 * copyright. Permission in the United States and in foreign countries, to the
 * extent that NIST may hold copyright, to use, copy, modify, create derivative
 * works, and distribute this software and its documentation without fee is hereby
 * granted on a non-exclusive basis, provided that this notice and disclaimer
 * of warranty appears in all copies.
 *
 * THE SOFTWARE IS PROVIDED 'AS IS' WITHOUT ANY WARRANTY OF ANY KIND, EITHER
 * EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY
 * THAT THE SOFTWARE WILL CONFORM TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND FREEDOM FROM
 * INFRINGEMENT, AND ANY WARRANTY THAT THE DOCUMENTATION WILL CONFORM TO THE
 * SOFTWARE, OR ANY WARRANTY THAT THE SOFTWARE WILL BE ERROR FREE.  IN NO EVENT
 * SHALL NIST BE LIABLE FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO, DIRECT,
 * INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES, ARISING OUT OF, RESULTING FROM,
 * OR IN ANY WAY CONNECTED WITH THIS SOFTWARE, WHETHER OR NOT BASED UPON WARRANTY,
 * CONTRACT, TORT, OR OTHERWISE, WHETHER OR NOT INJURY WAS SUSTAINED BY PERSONS OR
 * PROPERTY OR OTHERWISE, AND WHETHER OR NOT LOSS WAS SUSTAINED FROM, OR AROSE OUT
 * OF THE RESULTS OF, OR USE OF, THE SOFTWARE OR SERVICES PROVIDED HEREUNDER.
 */

/*
 * This code is based on https://github.com/eclipse-ee4j/jaxb-ri/blob/master/jaxb-ri/core/src/main/java/org/glassfish/jaxb/core/api/impl/NameUtil.java
 *
 * Copyright (c) 1997, 2022 Oracle and/or its affiliates. All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0, which is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package gov.nist.secauto.metaschema.databind.codegen.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Methods that convert strings into various formats.
 */
@SuppressWarnings("PMD")
class NameUtil {
  protected static boolean isPunct(char ch) {
    return ch == '-' || ch == '.' || ch == ':' || ch == '_' || ch == '·' || ch == '·' || ch == '۝' || ch == '۞';
  }

  /**
   * Capitalizes the first character of the specified string, and de-capitalize
   * the rest of characters.
   *
   * @param str
   *          the string to capitalize
   * @return the capitalized string
   */
  public static String capitalize(String str) {
    if (!Character.isLowerCase(str.charAt(0))) {
      return str;
    }
    StringBuilder sb = new StringBuilder(str.length());
    sb.append(String.valueOf(str.charAt(0)).toUpperCase(Locale.ENGLISH));
    sb.append(str.substring(1).toLowerCase(Locale.ENGLISH));
    return sb.toString();
  }

  // Precondition: s[start] is not punctuation
  @SuppressWarnings({
      "checkstyle:MissingSwitchDefaultCheck", // intentional
  })
  private static int nextBreak(String str, int start) {
    int len = str.length();

    char c1 = str.charAt(start);
    int t1 = classify(c1);

    for (int i = start + 1; i < len; i++) {
      // shift (c1,t1) into (c0,t0)
      // char c0 = c1; --- conceptually, but c0 won't be used
      int t0 = t1;

      c1 = str.charAt(i);
      t1 = classify(c1);

      switch (actionTable[t0 * 5 + t1]) {
      case ACTION_CHECK_PUNCT:
        if (isPunct(c1)) {
          return i;
        }
        break;
      case ACTION_CHECK_C2:
        if (i < len - 1) {
          char c2 = str.charAt(i + 1);
          if (Character.isLowerCase(c2)) {
            return i;
          }
        }
        break;
      case ACTION_BREAK:
        return i;
      }
    }
    return -1;
  }

  // the 5-category classification that we use in this code
  // to find work breaks
  protected static final int UPPER_LETTER = 0;
  protected static final int LOWER_LETTER = 1;
  protected static final int OTHER_LETTER = 2;
  protected static final int DIGIT = 3;
  protected static final int OTHER = 4;

  /**
   * Look up table for actions. type0*5+type1 would yield the action to be taken.
   */
  private static final byte[] actionTable = new byte[5 * 5];

  // action constants. see nextBreak for the meaning
  private static final byte ACTION_CHECK_PUNCT = 0;
  private static final byte ACTION_CHECK_C2 = 1;
  private static final byte ACTION_BREAK = 2;
  private static final byte ACTION_NOBREAK = 3;

  /**
   * Decide the action to be taken given the classification of the preceding
   * character 't0' and the classification of the next character 't1'.
   */
  private static byte decideAction(int t0, int t1) {
    if (t0 == OTHER && t1 == OTHER) {
      return ACTION_CHECK_PUNCT;
    }
    if ((t0 == DIGIT) ^ (t1 == DIGIT)) {
      return ACTION_BREAK;
    }
    if (t0 == LOWER_LETTER && t1 != LOWER_LETTER) {
      return ACTION_BREAK;
    }
    if ((t0 <= OTHER_LETTER) ^ (t1 <= OTHER_LETTER)) {
      return ACTION_BREAK;
    }
    if ((t0 == OTHER_LETTER) ^ (t1 == OTHER_LETTER)) {
      return ACTION_BREAK;
    }
    if (t0 == UPPER_LETTER && t1 == UPPER_LETTER) {
      return ACTION_CHECK_C2;
    }

    return ACTION_NOBREAK;
  }

  static {
    // initialize the action table
    for (int t0 = 0; t0 < 5; t0++) {
      for (int t1 = 0; t1 < 5; t1++) {
        actionTable[t0 * 5 + t1] = decideAction(t0, t1);
      }
    }
  }

  /**
   * Classify a character into 5 categories that determine the word break.
   *
   * @param ch
   *          the character
   * @return the categorization
   */
  protected static int classify(char ch) {
    switch (Character.getType(ch)) {
    case Character.UPPERCASE_LETTER:
      return UPPER_LETTER;
    case Character.LOWERCASE_LETTER:
      return LOWER_LETTER;
    case Character.TITLECASE_LETTER:
    case Character.MODIFIER_LETTER:
    case Character.OTHER_LETTER:
      return OTHER_LETTER;
    case Character.DECIMAL_DIGIT_NUMBER:
      return DIGIT;
    default:
      return OTHER;
    }
  }

  /**
   * Tokenizes a string into words and capitalizes the first character of each
   * word.
   * <p>
   * This method uses a change in character type as a splitter of two words. For
   * example, "abc100ghi" will be splitted into {"Abc", "100","Ghi"}.
   *
   * @param str
   *          the string to split into a word list
   * @return the word list
   */
  public static List<String> toWordList(String str) {
    ArrayList<String> retval = new ArrayList<>();
    int len = str.length();
    for (int i = 0; i < len;) {

      // Skip punctuation
      while (i < len) {
        if (!isPunct(str.charAt(i))) {
          break;
        }
        i++;
      }
      if (i >= len) {
        break;
      }

      // Find next break and collect word
      int breakPos = nextBreak(str, i);
      String word = (breakPos == -1) ? str.substring(i) : str.substring(i, breakPos);
      retval.add(escape(capitalize(word)));
      if (breakPos == -1) {
        break;
      }
      i = breakPos;
    }

    // we can't guarantee a valid Java identifier anyway,
    // so there's not much point in rejecting things in this way.
    // if (ss.size() == 0)
    // throw new IllegalArgumentException("Zero-length identifier");
    return retval;
  }

  protected static String toMixedCaseName(List<String> ss, boolean startUpper) {
    StringBuilder sb = new StringBuilder();
    if (!ss.isEmpty()) {
      sb.append(startUpper ? ss.get(0) : ss.get(0).toLowerCase(Locale.ENGLISH));
      for (int i = 1; i < ss.size(); i++) {
        sb.append(ss.get(i));
      }
    }
    return sb.toString();
  }

  protected static String toMixedCaseVariableName(String[] ss,
      boolean startUpper,
      boolean cdrUpper) {
    if (cdrUpper) {
      for (int i = 1; i < ss.length; i++) {
        ss[i] = capitalize(ss[i]);
      }
    }
    StringBuilder sb = new StringBuilder();
    if (ss.length > 0) {
      sb.append(startUpper ? ss[0] : ss[0].toLowerCase(Locale.ENGLISH));
      for (int i = 1; i < ss.length; i++) {
        sb.append(ss[i]);
      }
    }
    return sb.toString();
  }

  /**
   * Formats a string into "THIS_KIND_OF_FORMAT_ABC_DEF".
   *
   * @param str
   *          the string to format
   * @return Always return a string but there's no guarantee that the generated
   *         code is a valid Java identifier.
   */
  public String toConstantName(String str) {
    return toConstantName(toWordList(str));
  }

  /**
   * Formats a string into "THIS_KIND_OF_FORMAT_ABC_DEF".
   *
   * @param ss
   *          a list of words
   * @return Always return a string but there's no guarantee that the generated
   *         code is a valid Java identifier.
   */
  public String toConstantName(List<String> ss) {
    StringBuilder sb = new StringBuilder();
    if (!ss.isEmpty()) {
      sb.append(ss.get(0).toUpperCase(Locale.ENGLISH));
      for (int i = 1; i < ss.size(); i++) {
        sb.append('_');
        sb.append(ss.get(i).toUpperCase(Locale.ENGLISH));
      }
    }
    return sb.toString();
  }

  /**
   * Escapes characters is the given string so that they can be printed by only
   * using US-ASCII characters.
   *
   * The escaped characters will be appended to the given StringBuffer.
   *
   * @param sb
   *          StringBuffer that receives escaped string.
   * @param str
   *          String to be escaped. <code>s.substring(start)</code> will be
   *          escaped and copied to the string buffer.
   * @param start
   *          the starting position in the string
   */
  @SuppressWarnings({
      "checkstyle:MissingSwitchDefaultCheck", // intentional
      "checkstyle:AvoidEscapedUnicodeCharactersCheck" // ok
  })
  public static void escape(StringBuilder sb, String str, int start) {
    int len = str.length();
    for (int i = start; i < len; i++) {
      char ch = str.charAt(i);
      if (Character.isJavaIdentifierPart(ch)) {
        sb.append(ch);
      } else {
        sb.append('_');
        if (ch <= '\u000f') {
          sb.append("000");
        } else if (ch <= '\u00ff') {
          sb.append("00");
        } else if (ch <= '\u0fff') {
          sb.append('0');
        }
        sb.append(Integer.toString(ch, 16));
      }
    }
  }

  /**
   * Escapes characters that are unusable as Java identifiers by replacing unsafe
   * characters with safe characters.
   */
  private static String escape(String str) {
    int len = str.length();
    for (int i = 0; i < len; i++) {
      if (!Character.isJavaIdentifierPart(str.charAt(i))) {
        StringBuilder sb = new StringBuilder(str.substring(0, i));
        escape(sb, str, i);
        return sb.toString();
      }
    }
    return str;
  }
}